From edc1d27233dbbbcb2182e2c1098b9cf9b5e9450b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 7 Jan 2024 00:18:04 +0100 Subject: [PATCH] Changes by create-pull-request action (#36) Automated changes by [create-pull-request](https://github.com/peter-evans/create-pull-request) GitHub action Co-authored-by: StefanBRas --- .../_stubs/0.19.17/polars/dataframe/frame | 6977 ++++++++++++++ .../_stubs/0.19.17/polars/expr/expr | 8289 +++++++++++++++++ .../_stubs/0.19.17/polars/lazyframe/frame | 4211 +++++++++ .../_stubs/0.19.17/polars/series/series | 4988 ++++++++++ .../_stubs/0.19.18/polars/dataframe/frame | 6977 ++++++++++++++ .../_stubs/0.19.18/polars/expr/expr | 8289 +++++++++++++++++ .../_stubs/0.19.18/polars/lazyframe/frame | 4211 +++++++++ .../_stubs/0.19.18/polars/series/series | 4988 ++++++++++ .../_stubs/0.19.19/polars/dataframe/frame | 6977 ++++++++++++++ .../_stubs/0.19.19/polars/expr/expr | 8289 +++++++++++++++++ .../_stubs/0.19.19/polars/lazyframe/frame | 4211 +++++++++ .../_stubs/0.19.19/polars/series/series | 4988 ++++++++++ .../_stubs/0.20.0/polars/dataframe/frame | 6977 ++++++++++++++ .../_stubs/0.20.0/polars/expr/expr | 8289 +++++++++++++++++ .../_stubs/0.20.0/polars/lazyframe/frame | 4211 +++++++++ .../_stubs/0.20.0/polars/series/series | 4988 ++++++++++ .../_stubs/0.20.1/polars/dataframe/frame | 6977 ++++++++++++++ .../_stubs/0.20.1/polars/expr/expr | 8289 +++++++++++++++++ .../_stubs/0.20.1/polars/lazyframe/frame | 4211 +++++++++ .../_stubs/0.20.1/polars/series/series | 4988 ++++++++++ .../_stubs/0.20.2/polars/dataframe/frame | 6977 ++++++++++++++ .../_stubs/0.20.2/polars/expr/expr | 8289 +++++++++++++++++ .../_stubs/0.20.2/polars/lazyframe/frame | 4211 +++++++++ .../_stubs/0.20.2/polars/series/series | 4988 ++++++++++ .../_stubs/0.20.3/polars/dataframe/frame | 6977 ++++++++++++++ .../_stubs/0.20.3/polars/expr/expr | 8289 +++++++++++++++++ .../_stubs/0.20.3/polars/lazyframe/frame | 4211 +++++++++ .../_stubs/0.20.3/polars/series/series | 4988 ++++++++++ 28 files changed, 171255 insertions(+) create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/dataframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/expr/expr create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/lazyframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/series/series create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/dataframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/expr/expr create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/series/series create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/expr/expr create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/lazyframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/series/series create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/dataframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/expr/expr create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/lazyframe/frame create mode 
100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/series/series create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/expr/expr create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/lazyframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/series/series create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/dataframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/expr/expr create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/lazyframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/series/series create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/dataframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/expr/expr create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/lazyframe/frame create mode 100644 polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/series/series diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/dataframe/frame new file mode 100644 index 0000000..562effd --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/dataframe/frame @@ -0,0 +1,6977 @@ +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as 
numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: Incomplete + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. 
+ memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ...
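A minimal sketch (not drawn from the patch itself, and assuming a recent polars from the 0.19/0.20 range stubbed here) of how the comparison and truthiness dunders stubbed above behave at runtime:

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

# __eq__/__gt__ dispatch through _comp and return element-wise boolean frames,
# not a single bool:
print(df == pl.DataFrame({"a": [1, 0, 3], "b": [4.0, 5.0, 0.0]}))
print(df > 2)

# __bool__ is annotated NoReturn: truthiness of a multi-element frame is ambiguous,
# so bool(df) raises instead of returning a value.
try:
    bool(df)
except TypeError as exc:
    print(exc)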
+ def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... 
) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... 
schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... 
) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. 
+ Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. 
+ + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. 
+ + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. 
+ https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... 
table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. 
+ overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. 
Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... 
+ >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... ) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... 
"bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. 
+ + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... 
+ >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. 
+ + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. 
If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
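+
+ As the parameters above note, `fill_value` also accepts expression input;
+ a hypothetical sketch (output omitted):
+
+ >>> df.shift(-2, fill_value=pl.lit(0))  # doctest: +SKIP
+
+ With a literal fill value: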
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
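+
+ Selectors may be passed as well; a hypothetical sketch (output omitted):
+
+ >>> import polars.selectors as cs
+ >>> df.select(cs.numeric())  # doctest: +SKIP
+
+ Using expressions as positional arguments: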
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
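+
+ A hypothetical sketch of propagating nulls rather than ignoring them
+ (output omitted):
+
+ >>> df.sum_horizontal(ignore_nulls=False)  # doctest: +SKIP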
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
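+
+ A hypothetical sketch combining the `named` and `buffer_size` parameters
+ described above (the buffer size shown is arbitrary; output omitted):
+
+ >>> rows = list(df.iter_rows(named=True, buffer_size=1024))  # doctest: +SKIP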
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. 
+ It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + By default, null values in the right dataframe are ignored. Use + `ignore_nulls=False` to overwrite values in this frame with null values in other + frame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/expr/expr new file mode 100644 index 0000000..5131d44 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/expr/expr @@ -0,0 +1,8289 @@ +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... 
+ def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. 
If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... 
.alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... 
pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', the index of the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``<t_0, t_1, ..., t_n>``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars cannot check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output. + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... 
+ - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. 
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
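The clip examples further below use literal bounds only. Since the parameter descriptions above note that expression input is also accepted, a minimal sketch of per-row bounds could look as follows; the column names "a" and "lo" are illustrative assumptions, not part of the stub.

import polars as pl

df = pl.DataFrame({"a": [-50, 5, 50], "lo": [0, 0, 10]})
# Per-row lower bound taken from column "lo", literal upper bound of 10;
# the clipped values come out as [0, 5, 10].
out = df.with_columns(clip=pl.col("a").clip(lower_bound=pl.col("lo"), upper_bound=10))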
+ + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
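The library example that follows extends with a literal value. As a complementary sketch based on the note above that passing `None` extends with nulls (the frame contents here are assumed for illustration):

import polars as pl

df = pl.DataFrame({"values": [1, 2, 3]})
# Passing value=None appends null entries rather than a constant,
# so the resulting column is [1, 2, 3, null, null].
out = df.select(pl.col("values").extend_constant(None, n=2))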
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. Accepts expression input. + Non-expression inputs are parsed as literals. + return_dtype + Set return dtype to override automatic return dtype determination. 
+ + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> df.with_columns( + ... pl.col("country_code") + ... .replace(country_code_map, default=None) + ... .alias("replaced") + ... ) + shape: (4, 2) + ┌──────────────┬─────────────┐ + │ country_code ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════════════╪═════════════╡ + │ FR ┆ France │ + │ ES ┆ null │ + │ DE ┆ Germany │ + │ null ┆ unspecified │ + └──────────────┴─────────────┘ + + The return type can be overridden with the `return_dtype` argument. + + >>> df = df.with_row_count() + >>> df.select( + ... "row_nr", + ... pl.col("row_nr") + ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + ... .alias("replaced"), + ... ) + shape: (4, 2) + ┌────────┬──────────┐ + │ row_nr ┆ replaced │ + │ --- ┆ --- │ + │ u32 ┆ u8 │ + ╞════════╪══════════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 0 │ + └────────┴──────────┘ + + To reference other columns as a `default` value, a struct column must be + constructed first. The first field must be the column in which values are + replaced. The other columns can be used in the default expression. + + >>> df.with_columns( + ... pl.struct("country_code", "row_nr") + ... .replace( + ... mapping=country_code_map, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... .alias("replaced") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬─────────────┐ + │ row_nr ┆ country_code ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ ES ┆ 1 │ + │ 2 ┆ DE ┆ Germany │ + │ 3 ┆ null ┆ unspecified │ + └────────┴──────────────┴─────────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. 
This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. 
+ returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/lazyframe/frame new file mode 100644 index 0000000..561f5b2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/lazyframe/frame @@ -0,0 +1,4211 @@ +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. 
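A minimal sketch of the public entry point mentioned above (the file path is an assumed placeholder, not part of the stub):

import polars as pl

# pl.scan_csv builds a LazyFrame; glob patterns expand to multiple files.
lf = pl.scan_csv("data/*.csv")
df = lf.collect()  # materialize only when the query result is actually needed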
+ + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to `StringIO` + and then use `LazyFrame.deserialize`. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to `deserialize`. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. 
+ polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. 
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
If no format is specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren\'t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an NDJSON file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ndjson("out.ndjson") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. 
+ slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... 
+ shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... 
) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. 
+ + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... 
(pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. 
+ fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/series/series new file mode 100644 index 0000000..4a40006 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/series/series @@ -0,0 +1,4988 @@ +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
+ def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. 
+ + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. 
+ + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
+ + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. 
+ + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. 
+ >>> s.value_counts(sort=True)
+ + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and `append` will change to always + behave like `append_chunks=True` (the previous default). For the + behavior of `append_chunks=False`, use `Series.extend`. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + \'\'\' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, e.g. the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + \'\'\' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + \'\'\' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order. + + """ + def not_(self) -> Series: + \'\'\' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + \'\'\' + def is_null(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + \'\'\' + def is_not_null(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + \'\'\' + def is_finite(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + \'\'\' + def is_infinite(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + \'\'\' + def is_nan(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + \'\'\' + def is_not_nan(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`.
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). 
+ + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
+ + Notes + ----- + This implementation of :func:`hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
+ + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.replace({2: 100}) + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> s.replace(country_code_map, default=None) + shape: (4,) + Series: \'country_code\' [str] + [ + "France" + null + "Germany" + "unspecified" + ] + + The return type can be overridden with the `return_dtype` argument. + + >>> s = pl.Series("a", [0, 1, 2, 3]) + >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + shape: (4,) + Series: \'a\' [u8] + [ + 0 + 10 + 20 + 0 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. 
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. 
+ + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. 
+ null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/dataframe/frame new file mode 100644 index 0000000..562effd --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/dataframe/frame @@ -0,0 +1,6977 @@ +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as 
_timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: Incomplete + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. 
+ + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. 
+ n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... 
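A minimal usage sketch of the operator overloads stubbed above (not part of the generated stub itself; names and values are illustrative, assuming a recent polars version):

    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

    # Comparison dunders (__eq__, __gt__, ...) compare elementwise and
    # return a DataFrame of booleans rather than a single bool.
    mask = df == pl.DataFrame({"a": [1, 0, 3], "b": [4.0, 5.0, 0.0]})

    # Arithmetic dunders (__mul__, __add__, ...) broadcast a scalar
    # across every column of the frame.
    doubled = df * 2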
+ def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. 
+ + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. 
+ ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... 
"y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... 
) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. 
+ + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
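+
+        For instance, a minimal sketch of the `@lru_cache` note above (illustrative
+        only; it assumes the UDF is pure, that each row tuple is hashable, and that
+        `df` is the frame from the example below):
+
+        .. code-block:: python
+
+            from functools import lru_cache
+
+            @lru_cache(maxsize=None)
+            def expensive(row: tuple) -> tuple:
+                # stand-in for costly per-row work; repeated rows hit the cache
+                return (row[0] * 2, row[1] * 3)
+
+            df.map_rows(expensive)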
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
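+
+        A minimal sketch of the two patterns described above (`frames`, `df1` and
+        `df2` stand in for DataFrames you already have):
+
+        .. code-block:: python
+
+            # many appends, then a query: vstack repeatedly and rechunk once
+            df = frames[0]
+            for other in frames[1:]:
+                df = df.vstack(other)
+            df = df.rechunk()
+
+            # a single append followed by a query: extend appends into the existing
+            # memory (no extra chunk if no reallocation is needed)
+            df1.extend(df2)
+            result = df1.select(pl.col("foo").sum())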
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
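+
+        As a rough mental model (an informal sketch inferred from the examples
+        below, not a description of the implementation), the value placed at output
+        row ``i`` of the ``j``-th copy of a column comes from this input row:
+
+        .. code-block:: python
+
+            def source_row(i: int, j: int, step: int, how: str = "vertical") -> int:
+                # e.g. step=4, how="vertical": column "x_1", row 0 comes from input row 4
+                return j * step + i if how == "vertical" else i * step + j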
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
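+
+        For example (a small sketch of the `ignore_nulls` behaviour documented
+        above; results shown as comments):
+
+        .. code-block:: python
+
+            df = pl.DataFrame({"a": [1, None], "b": [10, 20]})
+            df.sum_horizontal()                    # nulls ignored: [11, 20]
+            df.sum_horizontal(ignore_nulls=False)  # nulls propagate: [11, null]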
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your
+ use-case you should export to a different format (such as Arrow or NumPy).
+
+ See Also
+ --------
+ rows : Materialise all frame data as a list of rows (potentially expensive).
+ iter_rows : Row iterator over frame data (does not materialise all rows).
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "w": ["a", "b", "b", "a"],
+ ... "x": ["q", "q", "q", "k"],
+ ... "y": [1.0, 2.5, 3.0, 4.5],
+ ... "z": [9, 8, 7, 6],
+ ... }
+ ... )
+
+ Group rows by the given key column(s):
+
+ >>> df.rows_by_key(key=["w"])
+ defaultdict(<class \'list\'>,
+ {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)],
+ \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]})
+
+ Return the same row groupings as dictionaries:
+
+ >>> df.rows_by_key(key=["w"], named=True)
+ defaultdict(<class \'list\'>,
+ {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9},
+ {\'x\': \'k\', \'y\': 4.5, \'z\': 6}],
+ \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8},
+ {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]})
+
+ Return row groupings, assuming keys are unique:
+
+ >>> df.rows_by_key(key=["z"], unique=True)
+ {9: (\'a\', \'q\', 1.0),
+ 8: (\'b\', \'q\', 2.5),
+ 7: (\'b\', \'q\', 3.0),
+ 6: (\'a\', \'k\', 4.5)}
+
+ Return row groupings as dictionaries, assuming keys are unique:
+
+ >>> df.rows_by_key(key=["z"], named=True, unique=True)
+ {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0},
+ 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5},
+ 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0},
+ 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}}
+
+ Return dictionary rows grouped by a compound key, including key values:
+
+ >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True)
+ defaultdict(<class \'list\'>,
+ {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}],
+ (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8},
+ {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}],
+ (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]})
+
+ '''
+ def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
+ '''
+ Returns an iterator over the DataFrame of rows of python-native values.
+
+ Parameters
+ ----------
+ named
+ Return dictionaries instead of tuples. The dictionaries are a mapping of
+ column name to row value. This is more expensive than returning a regular
+ tuple, but allows for accessing values by column name.
+ buffer_size
+ Determines the number of rows that are buffered internally while iterating
+ over the data; you should only modify this in very specific cases where the
+ default value is determined not to be a good fit to your access pattern, as
+ the speedup from using the buffer is significant (~2-4x). Setting this
+ value to zero disables row buffering (not recommended).
+
+ Notes
+ -----
+ If you have `ns`-precision temporal values you should be aware that Python
+ natively only supports up to `μs`-precision; `ns`-precision values will be
+ truncated to microseconds on conversion to Python. If this matters to your
+ use-case you should export to a different format (such as Arrow or NumPy).
+
+ Warnings
+ --------
+ Row iteration is not optimal as the underlying data is stored in columnar form;
+ where possible, prefer export via one of the dedicated export/output methods
+ that deal with columnar data.
+
+ Returns
+ -------
+ iterator of tuples (default) or dictionaries (if named) of python row values
+
+ See Also
+ --------
+ rows : Materialises all frame data as a list of rows (potentially expensive).
+ rows_by_key : Materialises frame data as a key-indexed dictionary.
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. 
+ It is the caller\'s responsibility to ensure that the frames are sorted
+ by that key; otherwise the output will not make sense.
+
+ The schemas of both DataFrames must be equal.
+
+ Parameters
+ ----------
+ other
+ Other DataFrame that must be merged
+ key
+ Key that is sorted.
+
+ Examples
+ --------
+ >>> df0 = pl.DataFrame(
+ ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]}
+ ... ).sort("age")
+ >>> df0
+ shape: (3, 2)
+ ┌───────┬─────┐
+ │ name ┆ age │
+ │ --- ┆ --- │
+ │ str ┆ i64 │
+ ╞═══════╪═════╡
+ │ bob ┆ 18 │
+ │ steve ┆ 42 │
+ │ elise ┆ 44 │
+ └───────┴─────┘
+ >>> df1 = pl.DataFrame(
+ ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]}
+ ... ).sort("age")
+ >>> df1
+ shape: (4, 2)
+ ┌────────┬─────┐
+ │ name ┆ age │
+ │ --- ┆ --- │
+ │ str ┆ i64 │
+ ╞════════╪═════╡
+ │ thomas ┆ 20 │
+ │ anna ┆ 21 │
+ │ megan ┆ 33 │
+ │ steve ┆ 42 │
+ └────────┴─────┘
+ >>> df0.merge_sorted(df1, key="age")
+ shape: (7, 2)
+ ┌────────┬─────┐
+ │ name ┆ age │
+ │ --- ┆ --- │
+ │ str ┆ i64 │
+ ╞════════╪═════╡
+ │ bob ┆ 18 │
+ │ thomas ┆ 20 │
+ │ anna ┆ 21 │
+ │ megan ┆ 33 │
+ │ steve ┆ 42 │
+ │ steve ┆ 42 │
+ │ elise ┆ 44 │
+ └────────┴─────┘
+ '''
+ def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame:
+ """
+ Indicate that one or multiple columns are sorted.
+
+ Parameters
+ ----------
+ column
+ Columns that are sorted
+ more_columns
+ Additional columns that are sorted, specified as positional arguments.
+ descending
+ Whether the columns are sorted in descending order.
+ """
+ def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame:
+ '''
+ Update the values in this `DataFrame` with the values in `other`.
+
+ By default, null values in the right dataframe are ignored. Use
+ `include_nulls=True` to overwrite values in this frame with null values from
+ the other frame.
+
+ Notes
+ -----
+ This is syntactic sugar for a left/inner join, with an optional coalesce when
+ `include_nulls = False`.
+
+ Warnings
+ --------
+ This functionality is experimental and may change without it being considered a
+ breaking change.
+
+ Parameters
+ ----------
+ other
+ DataFrame that will be used to update the values
+ on
+ Column names that will be joined on.
+ If none given the row count is used.
+ left_on
+ Join column(s) of the left DataFrame.
+ right_on
+ Join column(s) of the right DataFrame.
+ how : {\'left\', \'inner\', \'outer\'}
+ * \'left\' will keep all rows from the left table; rows may be duplicated
+ if multiple rows in the right frame match the left row\'s key.
+ * \'inner\' keeps only those rows where the key exists in both frames.
+ * \'outer\' will update existing rows where the key matches while also
+ adding any new rows contained in the given frame.
+ include_nulls
+ If True, null values from the right dataframe will be used to update the
+ left dataframe.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "A": [1, 2, 3, 4],
+ ... "B": [400, 500, 600, 700],
+ ... }
+ ... )
+ >>> df
+ shape: (4, 2)
+ ┌─────┬─────┐
+ │ A ┆ B │
+ │ --- ┆ --- │
+ │ i64 ┆ i64 │
+ ╞═════╪═════╡
+ │ 1 ┆ 400 │
+ │ 2 ┆ 500 │
+ │ 3 ┆ 600 │
+ │ 4 ┆ 700 │
+ └─────┴─────┘
+ >>> new_df = pl.DataFrame(
+ ... {
+ ... "B": [-66, None, -99],
+ ... "C": [5, 3, 1],
+ ... }
+ ...
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
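The `with_columns_seq` method documented in the stub above ships without a usage example. A minimal sketch of the intended call pattern follows; it is illustrative only and not part of the generated stubs, the frame contents and column names are invented here, and it assumes the polars 0.19.x API that the docstring describes.

import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

# Expressions are evaluated one after another rather than in parallel,
# which is the point of the *_seq variant when each expression is cheap.
out = df.with_columns_seq(
    (pl.col("a") * pl.col("b")).alias("ab"),  # positional expression input
    a_float=pl.col("a").cast(pl.Float64),     # keyword input names the column
)
# Expected result: columns a, b, ab == [3, 8], a_float == [1.0, 2.0];
# any existing column named "ab" or "a_float" would have been replaced.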
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/expr/expr new file mode 100644 index 0000000..5131d44 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/expr/expr @@ -0,0 +1,8289 @@ +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... 
+ def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. 
If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... 
.alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... 
pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... 
+ - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. 
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
+ + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. Accepts expression input. + Non-expression inputs are parsed as literals. + return_dtype + Set return dtype to override automatic return dtype determination. 
+ + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> df.with_columns( + ... pl.col("country_code") + ... .replace(country_code_map, default=None) + ... .alias("replaced") + ... ) + shape: (4, 2) + ┌──────────────┬─────────────┐ + │ country_code ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════════════╪═════════════╡ + │ FR ┆ France │ + │ ES ┆ null │ + │ DE ┆ Germany │ + │ null ┆ unspecified │ + └──────────────┴─────────────┘ + + The return type can be overridden with the `return_dtype` argument. + + >>> df = df.with_row_count() + >>> df.select( + ... "row_nr", + ... pl.col("row_nr") + ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + ... .alias("replaced"), + ... ) + shape: (4, 2) + ┌────────┬──────────┐ + │ row_nr ┆ replaced │ + │ --- ┆ --- │ + │ u32 ┆ u8 │ + ╞════════╪══════════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 0 │ + └────────┴──────────┘ + + To reference other columns as a `default` value, a struct column must be + constructed first. The first field must be the column in which values are + replaced. The other columns can be used in the default expression. + + >>> df.with_columns( + ... pl.struct("country_code", "row_nr") + ... .replace( + ... mapping=country_code_map, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... .alias("replaced") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬─────────────┐ + │ row_nr ┆ country_code ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ ES ┆ 1 │ + │ 2 ┆ DE ┆ Germany │ + │ 3 ┆ null ┆ unspecified │ + └────────┴──────────────┴─────────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. 
This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. 
+ returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame new file mode 100644 index 0000000..561f5b2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame @@ -0,0 +1,4211 @@ +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. 
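+
+ A minimal usage sketch via the public entry point rather than this private
+ classmethod; the file path is hypothetical and the snippet is skipped by doctest:
+
+ >>> lf = pl.scan_csv("my_file.csv")  # doctest: +SKIP
+ >>> lf.select(pl.all().sum()).collect()  # doctest: +SKIP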
+ + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to `StringIO` + and then use `LazyFrame.deserialize`. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to `deserialize`. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. 
+ polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. 
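+
+ A minimal sketch of the streaming pattern, here also picking the `compression`
+ option described under Parameters; the paths are hypothetical and the snippet is
+ skipped by doctest:
+
+ >>> pl.scan_csv("my_larger_than_ram_file.csv").sink_parquet(
+ ... "out.parquet", compression="zstd"
+ ... )  # doctest: +SKIP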
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_json("out.json") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. 
+ slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... 
+ shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... 
) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. 
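A minimal, self-contained sketch of the `validate` behaviour described above (frame and column names are illustrative; only `polars` imported as `pl` is assumed):

.. code-block:: python

    import polars as pl

    left = pl.LazyFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
    right = pl.LazyFrame({"id": [1, 2, 2], "y": [10, 20, 30]})

    # "m:1" asserts that the join keys are unique in the right frame;
    # here "id" is duplicated on the right, so executing this plan fails.
    checked = left.join(right, on="id", how="left", validate="m:1")
    # checked.collect()  # raises because right["id"] contains duplicates

    # The default many-to-many join ("m:m") performs no such check.
    print(left.join(right, on="id", how="left").collect())

Because the frame is lazy, the uniqueness check only runs when the plan is executed, so a violation surfaces at `.collect()` time rather than when the join is defined.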
+ + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... 
(pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
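`with_columns_seq` above is documented without an example of its own; it accepts the same inputs as `with_columns`, only evaluating the expressions sequentially, so a minimal sketch (names are illustrative) looks like this:

.. code-block:: python

    import polars as pl

    lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [0.5, 4.0, 10.0, 13.0]})

    # Each expression is cheap, so sequential evaluation is a reasonable fit.
    out = lf.with_columns_seq(
        (pl.col("a") ** 2).alias("a^2"),
        b_half=pl.col("b") / 2,
    ).collect()
    print(out)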
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. 
+ fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
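To make the `N - ddof` divisor concrete, the figures in the surrounding `std`/`var` examples can be reproduced by hand with plain Python:

.. code-block:: python

    import math

    a = [1, 2, 3, 4]
    mean = sum(a) / len(a)                # 2.5
    ss = sum((x - mean) ** 2 for x in a)  # 5.0, sum of squared deviations

    print(ss / (len(a) - 1))              # 1.666..., var() with the default ddof=1
    print(ss / len(a))                    # 1.25, var(ddof=0)
    print(math.sqrt(ss / (len(a) - 1)))   # 1.290994..., std() with the default ddof=1
    print(math.sqrt(ss / len(a)))         # 1.118033..., std(ddof=0)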
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/series/series new file mode 100644 index 0000000..4a40006 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/series/series @@ -0,0 +1,4988 @@ +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
+ def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. 
+ + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. 
+ + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
+ + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. 
+ + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. 
+ + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and `append` will change to always + behave like `append_chunks=True` (the previous default). For the + behavior of `append_chunks=False`, use `Series.extend`. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
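`search_sorted`, stubbed above, has no example; a minimal sketch on an already-sorted Series (a scalar element returns a plain index):

>>> s = pl.Series("a", [1, 2, 3, 5])
>>> s.search_sorted(4)
3
>>> s.search_sorted(3, side="left")
2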
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
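`null_count` and `is_sorted`, stubbed above, carry no examples; a minimal sketch:

>>> s = pl.Series("a", [1, None, 3])
>>> s.null_count()
1
>>> pl.Series("a", [1, 2, 2, 3]).is_sorted()
True
>>> pl.Series("a", [3, 2, 1]).is_sorted(descending=True)
True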
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
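+ + As noted above, the name and dtype are preserved even though all data is + dropped; a quick illustrative check: + + >>> s = pl.Series("a", [None, True, False]).clear() + >>> (s.name, s.dtype) + (\'a\', Boolean)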
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). 
+ + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
+ + Notes + ----- + This implementation of :func:`hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
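+ + A rough sketch of that `when-then-otherwise` alternative for a non-numeric + column (the column name and upper cutoff below are purely illustrative): + + >>> s = pl.Series("s", ["a", "m", "z"]) + >>> s.to_frame().select( + ... pl.when(pl.col("s") > "x").then(pl.lit("x")).otherwise(pl.col("s")) + ... ) # doctest: +SKIP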
+ + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.replace({2: 100}) + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> s.replace(country_code_map, default=None) + shape: (4,) + Series: \'country_code\' [str] + [ + "France" + null + "Germany" + "unspecified" + ] + + The return type can be overridden with the `return_dtype` argument. + + >>> s = pl.Series("a", [0, 1, 2, 3]) + >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + shape: (4,) + Series: \'a\' [u8] + [ + 0 + 10 + 20 + 0 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. 
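+ + A `-1` in `dimensions` infers that axis from the data, as described above; + a small illustrative check: + + >>> pl.Series([1, 2, 3, 4, 5, 6]).reshape((2, -1)).to_list() + [[1, 2, 3], [4, 5, 6]]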
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. 
+ + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. 
+ null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame new file mode 100644 index 0000000..562effd --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame @@ -0,0 +1,6977 @@ +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as 
_timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: Incomplete + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. 
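+ 
+ In typical use this private constructor is reached through the public `pl.from_pandas` function rather than being called directly. A minimal illustrative sketch, assuming pandas is installed:
+ 
+ >>> import pandas as pd  # doctest: +SKIP
+ >>> pdf = pd.DataFrame({"foo": [1, 2, 3], "bar": ["a", "b", "c"]})  # doctest: +SKIP
+ >>> pl.from_pandas(pdf)  # doctest: +SKIP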
+ + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. 
+ n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... 
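+ # Illustrative note: the comparison stubs above (`__eq__`, `__lt__`, ...) return an
+ # element-wise boolean DataFrame rather than a single bool, and arithmetic with a
+ # scalar broadcasts across all columns, e.g.:
+ #
+ #   df = pl.DataFrame({"a": [1, 2, 3]})
+ #   df * 2    # every value doubled, via __mul__
+ #   df == 2   # boolean DataFrame marking matches, via __eq__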
+ def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. 
+ + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. 
+ ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... 
"y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... 
) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. 
+ + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
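+
+ A minimal sketch of the `@lru_cache` suggestion above (assuming the row values
+ are hashable and the cached helper is pure; the `expensive` function is
+ hypothetical and only stands in for costly per-value work):
+
+ .. code-block:: python
+
+     from functools import lru_cache
+
+     @lru_cache(maxsize=None)
+     def expensive(x):
+         # stand-in for an expensive, pure computation
+         return x * x
+
+     # repeated inputs (the two 2s) hit the cache instead of recomputing
+     pl.DataFrame({"foo": [1, 2, 2, 3]}).map_rows(lambda t: (expensive(t[0]),))
+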
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
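+
+ A minimal sketch of the "append many times, then query" pattern described
+ above (the file names are hypothetical and the inputs are assumed to share a
+ single schema): grow the frame with repeated `vstack` calls and finish the
+ sequence with one `rechunk`.
+
+ .. code-block:: python
+
+     # hypothetical CSV inputs with identical schemas
+     frames = [pl.read_csv(name) for name in ["day1.csv", "day2.csv"]]
+
+     combined = frames[0]
+     for frame in frames[1:]:
+         combined = combined.vstack(frame)
+
+     # consolidate the accumulated chunks before querying
+     combined = combined.rechunk()
+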
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+
+ >>> df.shift(-2, fill_value=100)
+ shape: (4, 2)
+ ┌─────┬─────┐
+ │ a ┆ b │
+ │ --- ┆ --- │
+ │ i64 ┆ i64 │
+ ╞═════╪═════╡
+ │ 3 ┆ 7 │
+ │ 4 ┆ 8 │
+ │ 100 ┆ 100 │
+ │ 100 ┆ 100 │
+ └─────┴─────┘
+
+ '''
+ def is_duplicated(self) -> Series:
+ '''
+ Get a mask of all duplicated rows in this DataFrame.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "a": [1, 2, 3, 1],
+ ... "b": ["x", "y", "z", "x"],
+ ... }
+ ... )
+ >>> df.is_duplicated()
+ shape: (4,)
+ Series: \'\' [bool]
+ [
+ true
+ false
+ false
+ true
+ ]
+
+ This mask can be used to visualize the duplicated lines like this:
+
+ >>> df.filter(df.is_duplicated())
+ shape: (2, 2)
+ ┌─────┬─────┐
+ │ a ┆ b │
+ │ --- ┆ --- │
+ │ i64 ┆ str │
+ ╞═════╪═════╡
+ │ 1 ┆ x │
+ │ 1 ┆ x │
+ └─────┴─────┘
+ '''
+ def is_unique(self) -> Series:
+ '''
+ Get a mask of all unique rows in this DataFrame.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "a": [1, 2, 3, 1],
+ ... "b": ["x", "y", "z", "x"],
+ ... }
+ ... )
+ >>> df.is_unique()
+ shape: (4,)
+ Series: \'\' [bool]
+ [
+ false
+ true
+ true
+ false
+ ]
+
+ This mask can be used to visualize the unique lines like this:
+
+ >>> df.filter(df.is_unique())
+ shape: (2, 2)
+ ┌─────┬─────┐
+ │ a ┆ b │
+ │ --- ┆ --- │
+ │ i64 ┆ str │
+ ╞═════╪═════╡
+ │ 2 ┆ y │
+ │ 3 ┆ z │
+ └─────┴─────┘
+ '''
+ def lazy(self) -> LazyFrame:
+ '''
+ Start a lazy query from this point. This returns a `LazyFrame` object.
+
+ Operations on a `LazyFrame` are not executed until this is requested by either
+ calling:
+
+ * :meth:`.fetch() <polars.LazyFrame.fetch>`
+ (run on a small number of rows)
+ * :meth:`.collect() <polars.LazyFrame.collect>`
+ (run on all data)
+ * :meth:`.describe_plan() <polars.LazyFrame.describe_plan>`
+ (print unoptimized query plan)
+ * :meth:`.describe_optimized_plan() <polars.LazyFrame.describe_optimized_plan>`
+ (print optimized query plan)
+ * :meth:`.show_graph() <polars.LazyFrame.show_graph>`
+ (show (un)optimized query plan as graphviz graph)
+
+ Lazy operations are advised because they allow for query optimization and more
+ parallelization.
+
+ Returns
+ -------
+ LazyFrame
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "a": [None, 2, 3, 4],
+ ... "b": [0.5, None, 2.5, 13],
+ ... "c": [True, True, False, None],
+ ... }
+ ... )
+ >>> df.lazy() # doctest: +ELLIPSIS
+ <LazyFrame at ...>
+
+ '''
+ def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame:
+ '''
+ Select columns from this DataFrame.
+
+ Parameters
+ ----------
+ *exprs
+ Column(s) to select, specified as positional arguments.
+ Accepts expression input. Strings are parsed as column names,
+ other non-expression inputs are parsed as literals.
+ **named_exprs
+ Additional columns to select, specified as keyword arguments.
+ The columns will be renamed to the keyword used.
+
+ Examples
+ --------
+ Pass the name of a column to select that column.
+
+ >>> df = pl.DataFrame(
+ ... {
+ ... "foo": [1, 2, 3],
+ ... "bar": [6, 7, 8],
+ ... "ham": ["a", "b", "c"],
+ ... }
+ ... )
+ >>> df.select("foo")
+ shape: (3, 1)
+ ┌─────┐
+ │ foo │
+ │ --- │
+ │ i64 │
+ ╞═════╡
+ │ 1 │
+ │ 2 │
+ │ 3 │
+ └─────┘
+
+ Multiple columns can be selected by passing a list of column names.
+
+ >>> df.select(["foo", "bar"])
+ shape: (3, 2)
+ ┌─────┬─────┐
+ │ foo ┆ bar │
+ │ --- ┆ --- │
+ │ i64 ┆ i64 │
+ ╞═════╪═════╡
+ │ 1 ┆ 6 │
+ │ 2 ┆ 7 │
+ │ 3 ┆ 8 │
+ └─────┴─────┘
+
+ Multiple columns can also be selected using positional arguments instead of a
+ list. Expressions are also accepted.
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your
+ use-case you should export to a different format (such as Arrow or NumPy).
+
+ See Also
+ --------
+ rows : Materialise all frame data as a list of rows (potentially expensive).
+ iter_rows : Row iterator over frame data (does not materialise all rows).
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "w": ["a", "b", "b", "a"],
+ ... "x": ["q", "q", "q", "k"],
+ ... "y": [1.0, 2.5, 3.0, 4.5],
+ ... "z": [9, 8, 7, 6],
+ ... }
+ ... )
+
+ Group rows by the given key column(s):
+
+ >>> df.rows_by_key(key=["w"])
+ defaultdict(<class \'list\'>,
+ {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)],
+ \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]})
+
+ Return the same row groupings as dictionaries:
+
+ >>> df.rows_by_key(key=["w"], named=True)
+ defaultdict(<class \'list\'>,
+ {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9},
+ {\'x\': \'k\', \'y\': 4.5, \'z\': 6}],
+ \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8},
+ {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]})
+
+ Return row groupings, assuming keys are unique:
+
+ >>> df.rows_by_key(key=["z"], unique=True)
+ {9: (\'a\', \'q\', 1.0),
+ 8: (\'b\', \'q\', 2.5),
+ 7: (\'b\', \'q\', 3.0),
+ 6: (\'a\', \'k\', 4.5)}
+
+ Return row groupings as dictionaries, assuming keys are unique:
+
+ >>> df.rows_by_key(key=["z"], named=True, unique=True)
+ {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0},
+ 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5},
+ 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0},
+ 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}}
+
+ Return dictionary rows grouped by a compound key, including key values:
+
+ >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True)
+ defaultdict(<class \'list\'>,
+ {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}],
+ (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8},
+ {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}],
+ (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]})
+
+ \'\'\'
+ def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
+ \'\'\'
+ Returns an iterator over the DataFrame of rows of python-native values.
+
+ Parameters
+ ----------
+ named
+ Return dictionaries instead of tuples. The dictionaries are a mapping of
+ column name to row value. This is more expensive than returning a regular
+ tuple, but allows for accessing values by column name.
+ buffer_size
+ Determines the number of rows that are buffered internally while iterating
+ over the data; you should only modify this in very specific cases where the
+ default value is determined not to be a good fit to your access pattern, as
+ the speedup from using the buffer is significant (~2-4x). Setting this
+ value to zero disables row buffering (not recommended).
+
+ Notes
+ -----
+ If you have `ns`-precision temporal values you should be aware that Python
+ natively only supports up to `μs`-precision; `ns`-precision values will be
+ truncated to microseconds on conversion to Python. If this matters to your
+ use-case you should export to a different format (such as Arrow or NumPy).
+
+ Warnings
+ --------
+ Row iteration is not optimal as the underlying data is stored in columnar form;
+ where possible, prefer export via one of the dedicated export/output methods
+ that deal with columnar data.
+
+ Returns
+ -------
+ iterator of tuples (default) or dictionaries (if named) of python row values
+
+ See Also
+ --------
+ rows : Materialises all frame data as a list of rows (potentially expensive).
+ rows_by_key : Materialises frame data as a key-indexed dictionary.
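+
+ A minimal illustrative sketch (editorial addition, not part of the generated
+ stub) showing `named` combined with an explicit `buffer_size`; the frame and
+ values below are assumed purely for demonstration:
+
+ >>> df = pl.DataFrame({"x": [1, 2], "y": [10, 20]})
+ >>> [row["x"] + row["y"] for row in df.iter_rows(named=True, buffer_size=256)]
+ [11, 22]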
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. 
+ It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + By default, null values in the right dataframe are ignored. Use + `ignore_nulls=False` to overwrite values in this frame with null values in other + frame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
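The last stretch of this DataFrame stub is a run of deprecation shims whose docstrings each point at a renamed replacement (`groupby` -> `group_by`, `groupby_rolling`/`group_by_rolling` -> `rolling`, `groupby_dynamic` -> `group_by_dynamic`, `apply` -> `map_rows`, `shift_and_fill` -> `shift`, `take_every` -> `gather_every`, `find_idx_by_name` -> `get_column_index`, `insert_at_idx` -> `insert_column`, `replace_at_idx` -> `replace_column`, `frame_equal` -> `equals`). A minimal migration sketch follows; it is illustrative only (not part of the generated stub), assumes polars >= 0.19.17, and uses hypothetical column names:

import polars as pl

df = pl.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})

# old: df.groupby("group").agg(...)      new: group_by
df.group_by("group").agg(pl.col("value").sum())

# old: df.take_every(2)                  new: gather_every
df.gather_every(2)

# old: df.find_idx_by_name("value")      new: get_column_index
df.get_column_index("value")

# old: df.frame_equal(df)                new: equals
df.equals(df)

# old: df.apply(lambda row: row[1] * 2)  new: map_rows
df.map_rows(lambda row: row[1] * 2)

The deprecated names are still present in these 0.19.x stubs as shims that forward to the new methods (emitting a deprecation warning), which is why both spellings appear in the signatures above.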
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/expr/expr new file mode 100644 index 0000000..5131d44 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/expr/expr @@ -0,0 +1,8289 @@ +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... 
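# Editorial note (not part of the generated stub): the one-line operator stubs above
# (__add__, __and__, __eq__, __gt__, ...) are what let plain Python operators compose
# expressions lazily instead of evaluating them. A minimal sketch, assuming a recent
# polars install and hypothetical column names:
#
#   import polars as pl
#   expr = (pl.col("a") + 1) > 2                 # __add__ then __gt__ -> still an Expr
#   mask = expr & pl.col("b").is_not_null()      # __and__ combines boolean expressions
#   pl.DataFrame({"a": [1, 2], "b": [None, 3]}).filter(mask)
#   # keeps only the row where a + 1 > 2 and b is not null, i.e. a=2, b=3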
+ def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. 
If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... 
.alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... 
pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmin` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... 
+ - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. 
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
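+ + As a sketch only (the examples below are numeric): because the bounds also accept temporal literals, a Date column could be clipped with Python `date` values, e.g. assuming a column `d` of dtype Date (output omitted): + + >>> from datetime import date + >>> df.with_columns(pl.col("d").clip(date(2023, 1, 1), date(2023, 12, 31))) # doctest: +SKIP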
+ + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. Accepts expression input. + Non-expression inputs are parsed as literals. + return_dtype + Set return dtype to override automatic return dtype determination. 
+ + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> df.with_columns( + ... pl.col("country_code") + ... .replace(country_code_map, default=None) + ... .alias("replaced") + ... ) + shape: (4, 2) + ┌──────────────┬─────────────┐ + │ country_code ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════════════╪═════════════╡ + │ FR ┆ France │ + │ ES ┆ null │ + │ DE ┆ Germany │ + │ null ┆ unspecified │ + └──────────────┴─────────────┘ + + The return type can be overridden with the `return_dtype` argument. + + >>> df = df.with_row_count() + >>> df.select( + ... "row_nr", + ... pl.col("row_nr") + ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + ... .alias("replaced"), + ... ) + shape: (4, 2) + ┌────────┬──────────┐ + │ row_nr ┆ replaced │ + │ --- ┆ --- │ + │ u32 ┆ u8 │ + ╞════════╪══════════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 0 │ + └────────┴──────────┘ + + To reference other columns as a `default` value, a struct column must be + constructed first. The first field must be the column in which values are + replaced. The other columns can be used in the default expression. + + >>> df.with_columns( + ... pl.struct("country_code", "row_nr") + ... .replace( + ... mapping=country_code_map, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... .alias("replaced") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬─────────────┐ + │ row_nr ┆ country_code ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ ES ┆ 1 │ + │ 2 ┆ DE ┆ Germany │ + │ 3 ┆ null ┆ unspecified │ + └────────┴──────────────┴─────────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. 
This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. 
+ returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/lazyframe/frame new file mode 100644 index 0000000..561f5b2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/lazyframe/frame @@ -0,0 +1,4211 @@ +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to `StringIO` + and then use `LazyFrame.deserialize`. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to `deserialize`. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. 
+ polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. 
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_json("out.json") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. 
+ slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... 
+ shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... 
) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
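+
+            As an illustrative sketch (reusing the `time`/`n` frame from the
+            examples further below), weekly windows anchored on Mondays would be
+            requested with:
+
+            .. code-block:: python
+
+                lf.group_by_dynamic("time", every="1w", start_by="monday").agg(
+                    pl.col("n").sum()
+                )
+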
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. 
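+
+        As a brief sketch (reusing the `lf`/`other_lf` frames from the examples
+        below), a left join that additionally checks that the right-hand join
+        keys are unique would pass `validate="m:1"`:
+
+        .. code-block:: python
+
+            lf.join(other_lf, on="ham", how="left", validate="m:1")
+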
+ + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... 
(pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
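+
+            As the annotated signature suggests, a list of frames may also be
+            supplied; a minimal sketch (the extra frames `lf_a`, `lf_b` and the
+            column `c` are hypothetical):
+
+            .. code-block:: python
+
+                lf.with_context([lf_a, lf_b]).select(pl.col("c").first())
+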
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. 
+ fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
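+
+            As a worked check against the example below: column `a` is
+            `[1, 2, 3, 4]`, so its mean is 2.5 and the squared deviations sum to
+            5.0; the sample variance is therefore 5.0 / (4 - 1) = 1.666... and
+            the population variance (`ddof=0`) is 5.0 / 4 = 1.25.
+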
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
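Editorial note: the block above closes the generated LazyFrame stub for this polars version. As a hedged illustration only (not part of the generated stub, and using made-up column names and values), a minimal sketch of how a few of the signatures documented above are typically exercised against a matching 0.19/0.20 polars release:

import polars as pl

# Minimal sketch, assuming a polars release matching these stubs.
# Exercises a few of the LazyFrame methods whose signatures appear above.
lf = pl.LazyFrame({"a": [1, 2, None, 4], "b": [10, 10, 30, 40]})
result = (
    lf.fill_null(strategy="forward")                           # fill_null(), as documented above
    .unique(subset=["b"], keep="first", maintain_order=True)   # unique() on a column subset
    .with_row_count()                                          # adds the default "row_nr" index column
    .collect()                                                 # materialise the lazy query
)
print(result)

The chained-then-collect shape is the usage pattern the stub's doctest examples assume throughout; the specific method combination here is chosen only to touch several of the signatures above in one pipeline.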
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/series/series new file mode 100644 index 0000000..4a40006 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/series/series @@ -0,0 +1,4988 @@ +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
+ def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. 
+ + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. 
+ + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
+ + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. 
+ + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. 
+ + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and `append` will change to always + behave like `append_chunks=True` (the previous default). For the + behavior of `append_chunks=False`, use `Series.extend`. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). 
+ + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
+ + Notes + ----- + This implementation of :func:`hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
+ + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.replace({2: 100}) + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> s.replace(country_code_map, default=None) + shape: (4,) + Series: \'country_code\' [str] + [ + "France" + null + "Germany" + "unspecified" + ] + + The return type can be overridden with the `return_dtype` argument. + + >>> s = pl.Series("a", [0, 1, 2, 3]) + >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + shape: (4,) + Series: \'a\' [u8] + [ + 0 + 10 + 20 + 0 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. 
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. 
+ + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. 
+ null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/dataframe/frame new file mode 100644 index 0000000..562effd --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/dataframe/frame @@ -0,0 +1,6977 @@ +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as 
_timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: Incomplete + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. 
+ + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. 
+ n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... 
+ def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + <class \'pandas.core.frame.DataFrame\'> + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 <NA> + 1 2 <NA> b + 2 <NA> 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. 
+ + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
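+ As a brief illustrative call (file name assumed, not from the original text), + `df.write_parquet("data.parquet", compression="zstd")` writes a zstd-compressed + file; see `compression_level` below for tuning.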
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. 
+ ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... 
"y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... 
) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. 
+ + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
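+ (A minimal hedged illustration, not part of the original docstring: + `df.drop_nulls(subset=["bar"])` keeps only the rows where "bar" is non-null.)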
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
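+
+ For example, a minimal sketch of that caching pattern, using a purely
+ illustrative `expensive_fn` helper and the frame from the Examples below:
+
+ .. code-block:: python
+
+     from functools import lru_cache
+
+     @lru_cache(maxsize=None)
+     def expensive_fn(row: tuple) -> int:
+         # each distinct row tuple is computed once; duplicate rows hit the cache
+         return row[0] * 2 + row[1]
+
+     df.map_rows(expensive_fn)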
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
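+
+ As a rough sketch of the "append many times, then query" pattern described
+ above (the CSV paths are placeholders):
+
+ .. code-block:: python
+
+     # accumulate several frames with `vstack`, then `rechunk` once so the
+     # data is contiguous in memory before running queries
+     frames = [pl.read_csv(path) for path in ["a.csv", "b.csv", "c.csv"]]
+     combined = frames[0]
+     for frame in frames[1:]:
+         combined = combined.vstack(frame)
+     combined = combined.rechunk()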
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
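+
+ A small sketch of the `ignore_nulls` behaviour described above (the values in
+ the comments are indicative only):
+
+ .. code-block:: python
+
+     df = pl.DataFrame({"foo": [1, None, 3], "bar": [4.0, 5.0, 6.0]})
+     df.sum_horizontal()  # nulls ignored: roughly [5.0, 5.0, 9.0]
+     df.sum_horizontal(ignore_nulls=False)  # a null in a row yields null for that row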
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. 
+ It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + By default, null values in the right dataframe are ignored. Use + `ignore_nulls=False` to overwrite values in this frame with null values in other + frame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
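The deprecated aliases kept in the frame stub above (`groupby`, `groupby_rolling`, `group_by_rolling`, `groupby_dynamic`, `apply`, `shift_and_fill`, `take_every`, `find_idx_by_name`, `insert_at_idx`, `replace_at_idx`, `frame_equal`) each defer to a renamed method named in their docstrings. A minimal migration sketch, assuming polars >= 0.19.16 so all of the renamed methods exist; the frame, column names, and lambda are illustrative only:

import polars as pl

df = pl.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})

# group_by replaces the deprecated groupby
out = df.group_by("g", maintain_order=True).agg(pl.col("x").sum())

# gather_every replaces the deprecated take_every
every_other = df.gather_every(2)

# map_rows replaces the deprecated apply (rows are passed as tuples)
doubled = df.map_rows(lambda row: (row[1] * 2,))

# get_column_index replaces the deprecated find_idx_by_name
x_idx = df.get_column_index("x")

# equals replaces the deprecated frame_equal
assert df.equals(df)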
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/expr/expr new file mode 100644 index 0000000..5131d44 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/expr/expr @@ -0,0 +1,8289 @@ +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... 
+ def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. 
If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... 
.alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... 
pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... 
+ - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. 
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
+ + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. Accepts expression input. + Non-expression inputs are parsed as literals. + return_dtype + Set return dtype to override automatic return dtype determination. 
+ + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> df.with_columns( + ... pl.col("country_code") + ... .replace(country_code_map, default=None) + ... .alias("replaced") + ... ) + shape: (4, 2) + ┌──────────────┬─────────────┐ + │ country_code ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════════════╪═════════════╡ + │ FR ┆ France │ + │ ES ┆ null │ + │ DE ┆ Germany │ + │ null ┆ unspecified │ + └──────────────┴─────────────┘ + + The return type can be overridden with the `return_dtype` argument. + + >>> df = df.with_row_count() + >>> df.select( + ... "row_nr", + ... pl.col("row_nr") + ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + ... .alias("replaced"), + ... ) + shape: (4, 2) + ┌────────┬──────────┐ + │ row_nr ┆ replaced │ + │ --- ┆ --- │ + │ u32 ┆ u8 │ + ╞════════╪══════════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 0 │ + └────────┴──────────┘ + + To reference other columns as a `default` value, a struct column must be + constructed first. The first field must be the column in which values are + replaced. The other columns can be used in the default expression. + + >>> df.with_columns( + ... pl.struct("country_code", "row_nr") + ... .replace( + ... mapping=country_code_map, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... .alias("replaced") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬─────────────┐ + │ row_nr ┆ country_code ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ ES ┆ 1 │ + │ 2 ┆ DE ┆ Germany │ + │ 3 ┆ null ┆ unspecified │ + └────────┴──────────────┴─────────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. 
This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. 
+ returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/lazyframe/frame new file mode 100644 index 0000000..561f5b2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/lazyframe/frame @@ -0,0 +1,4211 @@ +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to `StringIO` + and then use `LazyFrame.deserialize`. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to `deserialize`. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. 
+ polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. 
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_json("out.json") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. 
+ slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... 
+ shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... 
) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. 
+ + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... 
(pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. 
+ fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
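For orientation, a minimal usage sketch that chains a few of the LazyFrame methods documented in the stub above (fill_null, with_columns, join, head). It is an illustration only, not part of the generated stub; the frame contents and column names ("key", "value", "extra") are made-up assumptions, and the behaviour shown follows the docstrings above.

import polars as pl

# Two small lazy frames with illustrative (assumed) column names.
lf = pl.LazyFrame({"key": ["a", "b", "c"], "value": [1, None, 3]})
other = pl.LazyFrame({"key": ["a", "b", "c"], "extra": [10, 20, 30]})

out = (
    lf.fill_null(strategy="forward")            # forward-fill the null in "value"
    .with_columns(doubled=pl.col("value") * 2)  # add a derived column
    .join(other, on="key", how="left")          # a left join preserves row order
    .head(2)                                    # applied at the end of the plan
    .collect()
)
print(out)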
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/series/series new file mode 100644 index 0000000..4a40006 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/series/series @@ -0,0 +1,4988 @@ +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
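For orientation, a small sketch of the comparison methods documented above (lt, eq, eq_missing); the series values are illustrative assumptions, not taken from the stub. The method forms mirror the corresponding operators, nulls propagate through eq/lt, and the *_missing variants never return null.

import polars as pl

s = pl.Series("a", [1, 2, None])

print(s.lt(2).to_list())          # [True, False, None] -- same result as (s < 2)
print(s.eq(2).to_list())          # [False, True, None] -- the null propagates
print(s.eq_missing(2).to_list())  # [False, True, False] -- null compares as a value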
+ def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. 
+ + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + \'\'\' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `False` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`.
+ + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
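+ + Example (illustrative; the output is assumed from the NaN-propagation behaviour described above): + + >>> pl.Series("a", [1.0, float("nan"), 4.0]).nan_max() + nan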
+ + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. 
+ + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. 
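+ + >>> s.value_counts(sort=True)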
+ + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and `append` will change to always + behave like `append_chunks=True` (the previous default). For the + behavior of `append_chunks=False`, use `Series.extend`. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + \'\'\' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of null + values. + Further operations on this pandas Series may trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). 
+ + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
+ + Notes + ----- + This implementation of :func:`hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
+ + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.replace({2: 100}) + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> s.replace(country_code_map, default=None) + shape: (4,) + Series: \'country_code\' [str] + [ + "France" + null + "Germany" + "unspecified" + ] + + The return type can be overridden with the `return_dtype` argument. + + >>> s = pl.Series("a", [0, 1, 2, 3]) + >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + shape: (4,) + Series: \'a\' [u8] + [ + 0 + 10 + 20 + 0 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. 
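The `-1` dimension inference mentioned in the `reshape` parameters is not exercised by the doctest below; a minimal sketch of it (public polars API, illustrative values):

    import polars as pl

    s = pl.Series("foo", [1, 2, 3, 4, 5, 6])

    # One dimension may be -1; it is inferred from the total length (6 / 2 = 3),
    # producing a Series of dtype List with rows [1, 2, 3] and [4, 5, 6]
    reshaped = s.reshape((2, -1))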
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. 
+ + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. 
+ null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame new file mode 100644 index 0000000..562effd --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame @@ -0,0 +1,6977 @@ +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as 
_timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: Incomplete + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. 
+ + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. 
+ n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ...
+ def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'<f4\'), (\'ham\', \'<U1\')]) + + ...optionally zero-copying as a record array view: + + >>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'<f4\'), (\'ham\', \'<U1\')]) + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame.
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + <class \'pandas.core.frame.DataFrame\'> + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 <NA> + 1 2 <NA> b + 2 <NA> 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common.
+ + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. 
+ ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... 
"y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... 
) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. 
+ + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
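+
+        A minimal sketch of that caching note, assuming the per-row work is a pure
+        function of hashable inputs (the helper name `expensive_lookup` is
+        illustrative and not part of this API):
+
+        .. code-block:: python
+
+            from functools import lru_cache
+
+            @lru_cache(maxsize=None)
+            def expensive_lookup(value):
+                # stand-in for a costly pure computation on one row value
+                return value * 2
+
+            # each distinct first-column value is computed only once
+            df.map_rows(lambda row: expensive_lookup(row[0]))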
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
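+
+        A minimal sketch of the `vstack`-then-`rechunk` pattern recommended above,
+        assuming `frames` is an illustrative list of DataFrames with identical
+        schemas (it is not part of this API):
+
+        .. code-block:: python
+
+            combined = frames[0]
+            for frame in frames[1:]:
+                # cheap append: adds chunks without copying existing data
+                combined.vstack(frame, in_place=True)
+            # consolidate the chunks once, before running queries
+            combined = combined.rechunk()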
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. 
+ It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + By default, null values in the right dataframe are ignored. Use + `ignore_nulls=False` to overwrite values in this frame with null values in other + frame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
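The stub above documents a long tail of deprecated DataFrame methods that only forward to renamed counterparts. As a quick orientation, here is a minimal sketch of the current spellings, assuming the polars 0.19/0.20 versions these stubs target and using made-up column names and data:

import polars as pl

df = pl.DataFrame({"grp": ["a", "a", "b"], "val": [1, 2, 3]})

# groupby(...)                       -> group_by(...)
agg = df.group_by("grp", maintain_order=True).agg(pl.col("val").sum())

# apply(...)                         -> map_rows(...)
doubled = df.map_rows(lambda row: (row[0], row[1] * 2))

# shift_and_fill(fill_value, n=...)  -> shift(n, fill_value=...)
shifted = df.select(pl.col("val").shift(1, fill_value=0))

# take_every(n)                      -> gather_every(n)
every_other = df.gather_every(2)

# find_idx_by_name(name)             -> get_column_index(name)
val_idx = df.get_column_index("val")

# frame_equal(other)                 -> equals(other)
same = df.equals(df)

# groupby_rolling / group_by_rolling -> rolling(...)
# groupby_dynamic                    -> group_by_dynamic(...)
ts = pl.DataFrame({"t": [1, 2, 3, 4, 5], "v": [10, 20, 30, 40, 50]}).set_sorted("t")
roll = ts.rolling(index_column="t", period="2i").agg(pl.col("v").sum())
dyn = ts.group_by_dynamic("t", every="2i").agg(pl.col("v").mean())

The deprecated names still work in these versions but emit a deprecation warning, so new code should prefer the right-hand forms.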
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/expr/expr new file mode 100644 index 0000000..5131d44 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/expr/expr @@ -0,0 +1,8289 @@ +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... 
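The operator dunders stubbed above are what let ordinary Python operators build lazy expression trees rather than evaluate eagerly; note that `__bool__` is typed as `NoReturn`, which is why `and`, `or` and `not` cannot be used on expressions. A small illustrative sketch (column names and data are made up):

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})

# __mul__, __add__ and __gt__ each return a new Expr rather than a value;
# __and__ (&) and __invert__ (~) combine Boolean expressions.
predicate = ((pl.col("a") * 2 + 1) > 4) & ~(pl.col("b") == 30)

out = df.filter(predicate)  # keeps rows where 2*a + 1 > 4 and b != 30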
+ def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
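A quick contrast between the `Expr.any`/`Expr.all` aggregations described here and the `polars.all` selector mentioned in the note above, sketched with made-up data:

import polars as pl

df = pl.DataFrame({"a": [True, True, None], "b": [True, False, True]})

# pl.all() selects every column; .all() / .any() then aggregate each Boolean column.
df.select(pl.all().all())                    # nulls ignored: a -> true, b -> false
df.select(pl.all().all(ignore_nulls=False))  # Kleene logic:  a -> null, b -> false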
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. 
If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... 
.alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... 
pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
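Scalar aggregations such as `sum`, `mean`, `n_unique` and `null_count` can be combined in a single `select` to build a one-row column summary; a sketch with made-up data:

import polars as pl

df = pl.DataFrame({"a": [1, 1, 2, None]})

# Each aggregation yields one value, so the result is a single-row frame.
summary = df.select(
    pl.col("a").sum().alias("sum"),
    pl.col("a").mean().alias("mean"),
    pl.col("a").n_unique().alias("n_unique"),
    pl.col("a").null_count().alias("null_count"),
)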
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64-bit integers. For integers with a smaller + bit width, you can safely use the cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... 
+ - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. 
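+ For an odd window size, `quantile=0.5` reproduces `rolling_median` over the
+ same window (an illustrative sketch, not part of the generated stub):
+
+ >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
+ >>> out = df.select(
+ ...     q=pl.col("A").rolling_quantile(quantile=0.5, window_size=3),
+ ...     m=pl.col("A").rolling_median(window_size=3),
+ ... )
+ >>> # both columns are [null, null, 2.0, 3.0]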
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. 
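+ A rough sketch (not part of the generated stub): on data without nulls the
+ default change matches dividing by the previous row and subtracting one:
+
+ >>> df = pl.DataFrame({"a": [10, 11, 12]})
+ >>> out = df.with_columns(
+ ...     pct=pl.col("a").pct_change(),
+ ...     manual=pl.col("a") / pl.col("a").shift(1) - 1,
+ ... )
+ >>> # both new columns are [null, 0.1, 0.090909...]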
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
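+ Because both bounds accept expression input, they can also be taken from
+ other columns (an illustrative sketch, not part of the generated stub):
+
+ >>> df = pl.DataFrame({"a": [-5, 5, 50], "lo": [0, 0, 0], "hi": [10, 10, 10]})
+ >>> out = df.with_columns(clip=pl.col("a").clip(pl.col("lo"), pl.col("hi")))
+ >>> # "clip" is [0, 5, 10]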
+ + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
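+ As a cross-check (illustrative sketch, not part of the generated stub), the
+ cotangent agrees with `cos(x) / sin(x)`:
+
+ >>> df = pl.DataFrame({"a": [1.0]})
+ >>> out = df.select(
+ ...     cot=pl.col("a").cot(),
+ ...     ratio=pl.col("a").cos() / pl.col("a").sin(),
+ ... )
+ >>> # both columns are approximately 0.642093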
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
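+ A note on the domain (illustrative sketch, not part of the generated stub):
+ the result is finite only for inputs strictly between -1 and 1; the example
+ below uses 1.0 and therefore returns infinity.
+
+ >>> df = pl.DataFrame({"a": [0.5]})
+ >>> out = df.select(pl.col("a").arctanh())
+ >>> # approximately 0.549306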
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. Accepts expression input. + Non-expression inputs are parsed as literals. + return_dtype + Set return dtype to override automatic return dtype determination. 
+ + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> df.with_columns( + ... pl.col("country_code") + ... .replace(country_code_map, default=None) + ... .alias("replaced") + ... ) + shape: (4, 2) + ┌──────────────┬─────────────┐ + │ country_code ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════════════╪═════════════╡ + │ FR ┆ France │ + │ ES ┆ null │ + │ DE ┆ Germany │ + │ null ┆ unspecified │ + └──────────────┴─────────────┘ + + The return type can be overridden with the `return_dtype` argument. + + >>> df = df.with_row_count() + >>> df.select( + ... "row_nr", + ... pl.col("row_nr") + ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + ... .alias("replaced"), + ... ) + shape: (4, 2) + ┌────────┬──────────┐ + │ row_nr ┆ replaced │ + │ --- ┆ --- │ + │ u32 ┆ u8 │ + ╞════════╪══════════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 0 │ + └────────┴──────────┘ + + To reference other columns as a `default` value, a struct column must be + constructed first. The first field must be the column in which values are + replaced. The other columns can be used in the default expression. + + >>> df.with_columns( + ... pl.struct("country_code", "row_nr") + ... .replace( + ... mapping=country_code_map, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... .alias("replaced") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬─────────────┐ + │ row_nr ┆ country_code ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ ES ┆ 1 │ + │ 2 ┆ DE ┆ Germany │ + │ 3 ┆ null ┆ unspecified │ + └────────┴──────────────┴─────────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. 
This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. 
+ returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/lazyframe/frame new file mode 100644 index 0000000..561f5b2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/lazyframe/frame @@ -0,0 +1,4211 @@ +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to `StringIO` + and then use `LazyFrame.deserialize`. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to `deserialize`. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. 
+ polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. 
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_json("out.json") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. 
+ slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... 
+ shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expressions sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``<t_0, t_1, ..., t_n>``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ...
+ * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... 
) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. 
+ + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... 
(pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. 
+ fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
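A minimal runtime sketch of how the `unique`, `drop_nulls` and `update` behaviour documented in the stubs above composes (illustrative only: it assumes a polars release matching these stubs is installed, and the frames below are made-up data, not taken from the stubs):

import polars as pl

lf = pl.LazyFrame({"key": [1, 1, 2, 3], "val": [10, 11, None, 13]})

# Keep the last row per duplicate "key", preserving input order (see `unique` above).
deduped = lf.unique(subset="key", keep="last", maintain_order=True)

# Drop rows that still contain a null in "val" (see `drop_nulls` above).
cleaned = deduped.drop_nulls(subset="val")

# Patch values with the non-null values from another frame, joined on "key"
# (see `update` above); the null in `patch` is ignored because `include_nulls`
# defaults to False.
patch = pl.LazyFrame({"key": [1, 3], "val": [None, 99]})
print(cleaned.update(patch, on="key", how="left").collect())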
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/series/series new file mode 100644 index 0000000..4a40006 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/series/series @@ -0,0 +1,4988 @@ +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of inequality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ...
+ def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. 
+ + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `False` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`.
+ + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
+ + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. 
+ + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. 
+ + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and `append` will change to always + behave like `append_chunks=True` (the previous default). For the + behavior of `append_chunks=False`, use `Series.extend`. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, e.g. the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`.
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy` for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use a PyArrow-backed extension array instead of a NumPy array for the pandas + Series. This allows zero-copy operations and preservation of null + values. + Further operations on the resulting pandas Series may trigger conversion + to NumPy arrays if an operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). 
+ + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
+ + Notes + ----- + This implementation of :func:`hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
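+
+ As a rough illustrative sketch (output omitted), the `when-then-otherwise` alternative mentioned above could look like this for the same bounds used in the examples below:
+
+ >>> s = pl.Series("a", [-50, 5, 50, None])
+ >>> s.to_frame().select(
+ ... pl.when(pl.col("a") < 1).then(1).when(pl.col("a") > 10).then(10).otherwise(pl.col("a"))
+ ... ) # doctest: +SKIP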
+ + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.replace({2: 100}) + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> s.replace(country_code_map, default=None) + shape: (4,) + Series: \'country_code\' [str] + [ + "France" + null + "Germany" + "unspecified" + ] + + The return type can be overridden with the `return_dtype` argument. + + >>> s = pl.Series("a", [0, 1, 2, 3]) + >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + shape: (4,) + Series: \'a\' [u8] + [ + 0 + 10 + 20 + 0 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. 
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. 
+ + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. 
+ null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/dataframe/frame new file mode 100644 index 0000000..562effd --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/dataframe/frame @@ -0,0 +1,6977 @@ +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as 
_timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: Incomplete + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. 
+ + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. 
+ n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... 
+ def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. 
+ + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to the dataset being written to a directory, + similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with `use_pyarrow=True` and `pyarrow_options` + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types.
+ ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... 
"y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... 
) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. 
+ + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``<t_0, t_1, ..., t_n>``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ...
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. 
+ It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + By default, null values in the right dataframe are ignored. Use + `ignore_nulls=False` to overwrite values in this frame with null values in other + frame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/expr/expr new file mode 100644 index 0000000..5131d44 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/expr/expr @@ -0,0 +1,8289 @@ +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... 
+ def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. 
If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... 
.alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... 
pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', the index of the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way.
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmin` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work.
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64-bit integers. For integers with fewer + bits, you can safely use the cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ...
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}`. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ...
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... 
+ - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. 
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
+ + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. Accepts expression input. + Non-expression inputs are parsed as literals. + return_dtype + Set return dtype to override automatic return dtype determination. 
+ + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> df.with_columns( + ... pl.col("country_code") + ... .replace(country_code_map, default=None) + ... .alias("replaced") + ... ) + shape: (4, 2) + ┌──────────────┬─────────────┐ + │ country_code ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════════════╪═════════════╡ + │ FR ┆ France │ + │ ES ┆ null │ + │ DE ┆ Germany │ + │ null ┆ unspecified │ + └──────────────┴─────────────┘ + + The return type can be overridden with the `return_dtype` argument. + + >>> df = df.with_row_count() + >>> df.select( + ... "row_nr", + ... pl.col("row_nr") + ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + ... .alias("replaced"), + ... ) + shape: (4, 2) + ┌────────┬──────────┐ + │ row_nr ┆ replaced │ + │ --- ┆ --- │ + │ u32 ┆ u8 │ + ╞════════╪══════════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 0 │ + └────────┴──────────┘ + + To reference other columns as a `default` value, a struct column must be + constructed first. The first field must be the column in which values are + replaced. The other columns can be used in the default expression. + + >>> df.with_columns( + ... pl.struct("country_code", "row_nr") + ... .replace( + ... mapping=country_code_map, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... .alias("replaced") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬─────────────┐ + │ row_nr ┆ country_code ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ ES ┆ 1 │ + │ 2 ┆ DE ┆ Germany │ + │ 3 ┆ null ┆ unspecified │ + └────────┴──────────────┴─────────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. 
This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. 
+ returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/lazyframe/frame new file mode 100644 index 0000000..561f5b2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/lazyframe/frame @@ -0,0 +1,4211 @@ +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to `StringIO` + and then use `LazyFrame.deserialize`. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to `deserialize`. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. 
+ polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. 
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
If no format is specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren\'t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an NDJSON file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ndjson("out.ndjson") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations.
+ slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... 
+ shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... 
) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. 
+ + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... 
(pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. 
+ fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/series/series new file mode 100644 index 0000000..4a40006 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/series/series @@ -0,0 +1,4988 @@ +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
+ def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. 
+ + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. 
+ + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
+ + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. 
+ + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. 
+ + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and `append` will change to always + behave like `append_chunks=True` (the previous default). For the + behavior of `append_chunks=False`, use `Series.extend`. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). 
+ + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
+ + Notes + ----- + This implementation of :func:`hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
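+ 
+ As an illustrative sketch (not part of the upstream docstring), an
+ equivalent `when-then-otherwise` expression for clipping a column "a"
+ to the range [1, 10] could look like:
+ 
+ >>> pl.when(pl.col("a") < 1).then(1).when(pl.col("a") > 10).then(10).otherwise(pl.col("a"))  # doctest: +SKIP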
+ + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.replace({2: 100}) + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> s.replace(country_code_map, default=None) + shape: (4,) + Series: \'country_code\' [str] + [ + "France" + null + "Germany" + "unspecified" + ] + + The return type can be overridden with the `return_dtype` argument. + + >>> s = pl.Series("a", [0, 1, 2, 3]) + >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + shape: (4,) + Series: \'a\' [u8] + [ + 0 + 10 + 20 + 0 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. 
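+ 
+ Notes
+ -----
+ Illustrative note (not from the upstream docstring): a `-1` in `dimensions`
+ is inferred from the remaining data, e.g.
+ 
+ >>> pl.Series("foo", [1, 2, 3, 4, 5, 6]).reshape((2, -1))  # doctest: +SKIP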
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
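+ 
+ Examples
+ --------
+ Illustrative usage of the renamed method (not from the upstream docstring):
+ 
+ >>> pl.Series([1, 1, 2]).is_first_distinct()  # doctest: +SKIP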
+ + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. 
+ + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. 
+ null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/dataframe/frame new file mode 100644 index 0000000..562effd --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/dataframe/frame @@ -0,0 +1,6977 @@ +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as 
_timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: Incomplete + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. 
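+ 
+ Typically reached via `pl.from_pandas` rather than called directly; an
+ illustrative sketch (not from the upstream docstring):
+ 
+ >>> import pandas as pd  # doctest: +SKIP
+ >>> pl.from_pandas(pd.DataFrame({"a": [1, 2, 3]}))  # doctest: +SKIP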
+ + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. 
+ n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ...
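+ # Illustrative note (not part of the generated stub): the arithmetic and
+ # comparison dunders above operate element-wise and broadcast scalars, e.g.
+ #   pl.DataFrame({"a": [1, 2, 3]}) * 2   # every numeric element doubled
+ #   pl.DataFrame({"a": [1, 2, 3]}) == 2  # element-wise boolean DataFrame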
+ def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. 
+ + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
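+ A minimal sketch, using hypothetical column names: `conditional_formats={"sales": "3_color_scale", ("q1", "q2"): {"type": "2_color_scale"}}` applies a default color scale to one column and a single two-color scale across the two other columns treated as one range.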
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
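+ A minimal sketch, using hypothetical column names: `formulas={"ratio": "=[@q1]/[@q2]"}` appends a computed "ratio" column at the end of the table, while `formulas={"ratio": {"formula": "=[@q1]/[@q2]", "insert_after": "q1", "return_dtype": pl.Float64}}` also controls its position and return dtype.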
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
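+ For example, as a hedged rule of thumb rather than a benchmark: `df.write_parquet(path, compression="zstd")` trades a little write speed for smaller files, while `compression="lz4"` is faster to write and read.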
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. 
+ ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... 
"y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... 
) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. 
+ + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
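+
+          As a rough sketch of that caching pattern (the doubling/tripling body
+          below is purely illustrative, mirroring the example that follows):
+
+          .. code-block:: python
+
+              from functools import lru_cache
+
+              @lru_cache(maxsize=None)
+              def udf(row: tuple) -> tuple:
+                  # row tuples are hashable, so repeated rows hit the cache
+                  return (row[0] * 2, row[1] * 3)
+
+              df.map_rows(udf)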
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
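+
+        A minimal sketch of that in-place behavior (the frames mirror the
+        example below; the `assert` is only illustrative):
+
+        .. code-block:: python
+
+            df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+            df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]})
+            df1.extend(df2)  # return value ignored on purpose
+            assert df1.shape == (6, 2)  # df1 itself now holds df2's rows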
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(<class \'list\'>, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(<class \'list\'>, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(<class \'list\'>, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. 
+ It is the caller\'s responsibility that the frames are sorted + by that key; otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + By default, null values in the right dataframe are ignored. Use + `include_nulls=True` to overwrite values in this frame with null values from the + other frame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given, the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
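The `update` docstring in the frame stub above notes that the operation is syntactic sugar for a left/inner join with an optional coalesce. Below is a minimal sketch of that equivalence (not part of the generated stubs): it assumes the public `join` and `coalesce` APIs, and the key column "A" and the "_new" suffix are illustrative choices.

import polars as pl

df = pl.DataFrame({"A": [1, 2, 3, 4], "B": [400, 500, 600, 700]})
new_df = pl.DataFrame({"A": [1, 3], "B": [-66, -99]})

# Roughly what df.update(new_df, on="A") does when include_nulls=False:
# left-join on the key, then prefer the right-hand value wherever it is non-null.
updated = (
    df.join(new_df, on="A", how="left", suffix="_new")
    .with_columns(pl.coalesce("B_new", "B").alias("B"))
    .drop("B_new")
)
print(updated)  # column B becomes [-66, 500, -99, 700]

The real `update` also covers the row-index case (no `on` given) and the `include_nulls=True` variant, which this sketch does not.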
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/expr/expr new file mode 100644 index 0000000..5131d44 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/expr/expr @@ -0,0 +1,8289 @@ +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... 
+ def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. 
If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... 
.alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... 
pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
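+
+ For non-null boolean columns, `xor` agrees with an inequality comparison; a small sketch (hypothetical frame, mirroring the boolean example below):
+
+ >>> df = pl.DataFrame(
+ ...     {"x": [True, False, True, False], "y": [True, True, False, False]}
+ ... )
+ >>> df.select(  # doctest: +SKIP
+ ...     same=pl.col("x").xor(pl.col("y")) == (pl.col("x") != pl.col("y")),
+ ... )
+ >>> # every row of "same" is true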
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... 
+ - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. 
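The rolling median shown above is, in effect, the 0.5 rolling quantile; a minimal cross-check sketch (hypothetical data, assuming linear interpolation so that even-sized windows average the two middle values):

import polars as pl

df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
out = df.with_columns(
    med=pl.col("A").rolling_median(window_size=2),
    q50=pl.col("A").rolling_quantile(
        quantile=0.5, interpolation="linear", window_size=2
    ),
)
# Both columns should agree: null, 1.5, 2.5, 3.5, 4.5, 5.5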
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. 
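A quick cross-check of `pct_change` against an explicit `diff`/`shift` formulation; a minimal sketch (hypothetical data, assuming no interior nulls, in which case the two agree):

import polars as pl

df = pl.DataFrame({"int": [20, 10, 30, 25, 35]})
out = df.with_columns(
    pct=pl.col("int").pct_change(),
    manual=(pl.col("int") - pl.col("int").shift(1)) / pl.col("int").shift(1),
)
# Both columns should give: null, -0.5, 2.0, -0.166667, 0.4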
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
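Because both bounds accept expression input, a column can be clipped against another column rather than a constant; a minimal sketch (hypothetical column names):

import polars as pl

df = pl.DataFrame({"a": [-50, 5, 50], "cap": [10, 10, 20]})
out = df.with_columns(
    clipped=pl.col("a").clip(lower_bound=0, upper_bound=pl.col("cap"))
)
# Expected: 0, 5, 20 (each value limited by its own row's cap)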
+ + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
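The cotangent is the reciprocal of the tangent (cos/sin), which gives a simple cross-check; a minimal sketch:

import polars as pl

df = pl.DataFrame({"a": [1.0]})
out = df.select(
    cot=pl.col("a").cot(),
    check=pl.col("a").cos() / pl.col("a").sin(),
)
# Both values should be approximately 0.642093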
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
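As a sanity check, `arctanh` inverts `tanh` for inputs in the open interval (-1, 1); a minimal round-trip sketch:

import polars as pl

df = pl.DataFrame({"a": [0.5]})
out = df.select(roundtrip=pl.col("a").tanh().arctanh())
# Expected to be approximately 0.5 (up to floating-point rounding)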
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
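Passing `None` as the value extends the result with nulls instead of a constant; a minimal sketch (hypothetical data):

import polars as pl

df = pl.DataFrame({"values": [1, 2, 3]})
out = df.select(pl.col("values").extend_constant(None, n=2))
# Expected: 1, 2, 3, null, null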
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. Accepts expression input. + Non-expression inputs are parsed as literals. + return_dtype + Set return dtype to override automatic return dtype determination. 
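A compact side-by-side of the `default` behaviour described above: without a default, unmapped values are kept; with one, they are overwritten. A minimal sketch (hypothetical data):

import polars as pl

df = pl.DataFrame({"code": [1, 2, 3]})
out = df.with_columns(
    kept=pl.col("code").replace({1: 10}),                  # unmapped values stay: 10, 2, 3
    defaulted=pl.col("code").replace({1: 10}, default=0),  # unmapped values become 0: 10, 0, 0
)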
+ + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> df.with_columns( + ... pl.col("country_code") + ... .replace(country_code_map, default=None) + ... .alias("replaced") + ... ) + shape: (4, 2) + ┌──────────────┬─────────────┐ + │ country_code ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════════════╪═════════════╡ + │ FR ┆ France │ + │ ES ┆ null │ + │ DE ┆ Germany │ + │ null ┆ unspecified │ + └──────────────┴─────────────┘ + + The return type can be overridden with the `return_dtype` argument. + + >>> df = df.with_row_count() + >>> df.select( + ... "row_nr", + ... pl.col("row_nr") + ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + ... .alias("replaced"), + ... ) + shape: (4, 2) + ┌────────┬──────────┐ + │ row_nr ┆ replaced │ + │ --- ┆ --- │ + │ u32 ┆ u8 │ + ╞════════╪══════════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 0 │ + └────────┴──────────┘ + + To reference other columns as a `default` value, a struct column must be + constructed first. The first field must be the column in which values are + replaced. The other columns can be used in the default expression. + + >>> df.with_columns( + ... pl.struct("country_code", "row_nr") + ... .replace( + ... mapping=country_code_map, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... .alias("replaced") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬─────────────┐ + │ row_nr ┆ country_code ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ ES ┆ 1 │ + │ 2 ┆ DE ┆ Germany │ + │ 3 ┆ null ┆ unspecified │ + └────────┴──────────────┴─────────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. 
This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. 
+ returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/lazyframe/frame new file mode 100644 index 0000000..561f5b2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/lazyframe/frame @@ -0,0 +1,4211 @@ +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. 
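+
+        A minimal usage sketch through the public entry point (the CSV path and
+        column name below are placeholders, not part of this stub):
+
+        >>> lf = pl.scan_csv("data.csv")  # doctest: +SKIP
+        >>> lf.select(pl.col("a")).collect()  # doctest: +SKIP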
+ + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to `StringIO` + and then use `LazyFrame.deserialize`. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to `deserialize`. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. 
+ polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. 
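+
+        A minimal end-to-end sketch (the paths and column name are placeholders;
+        the query must be one the streaming engine supports):
+
+        >>> (
+        ...     pl.scan_csv("in.csv")
+        ...     .filter(pl.col("a") > 0)
+        ...     .sink_parquet("out.parquet")
+        ... )  # doctest: +SKIP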
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_json("out.json") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. 
+ slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... 
+ shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... 
) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. 
+ + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... 
(pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. 
+ fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
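The deprecation notes above each name the replacement method (`group_by`, `rolling`, `group_by_dynamic`, `map_batches`, `shift`, `gather_every`). A minimal migration sketch, not part of the generated stub, using a hypothetical frame and only the renamed calls that need no extra window arguments:

import polars as pl

# Hypothetical data, for illustration only.
lf = pl.LazyFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})

# Renamed methods (deprecated spelling shown in the trailing comment):
grouped = lf.group_by("g").agg(pl.col("x").sum())  # was: lf.groupby(...)
sampled = lf.gather_every(2)                       # was: lf.take_every(2)
shifted = lf.shift(1)                              # was: lf.shift_and_fill(...)
mapped = lf.map_batches(lambda df: df)             # was: lf.map(...)

print(grouped.collect())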
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/series/series new file mode 100644 index 0000000..4a40006 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/series/series @@ -0,0 +1,4988 @@ +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
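+    # Illustrative sketch (not part of the generated stub): the comparison and
+    # bitwise dunders declared above are what make boolean-mask composition work
+    # on a Series. Assuming a hypothetical series `a`:
+    #
+    #     a = pl.Series("a", [1, 2, 3, 4])
+    #     mask = (a > 1) & (a < 4)        # __gt__ / __lt__ combined via __and__
+    #     either = (a == 1) | (a == 4)    # __eq__ results combined via __or__
+    #     a.filter(mask)                  # -> shape (2,): [2, 3]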
+ def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. 
+ + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. 
+ + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
+ + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. 
+ + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. 
+ + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and `append` will change to always + behave like `append_chunks=True` (the previous default). For the + behavior of `append_chunks=False`, use `Series.extend`. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). 
+ + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
+ + Notes + ----- + This implementation of :func:`hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
+ + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, mapping: dict[Any, Any]) -> Self: + ''' + Replace values according to the given mapping. + + Needs a global string cache for lazily evaluated queries on columns of + type `Categorical`. + + Parameters + ---------- + mapping + Mapping of values to their replacement. + default + Value to use when the mapping does not contain the lookup value. + Defaults to keeping the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + str.replace + + Examples + -------- + Replace a single value by another value. Values not in the mapping remain + unchanged. + + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.replace({2: 100}) + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values. Specify a default to set values not in the given map + to the default value. + + >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) + >>> country_code_map = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "unspecified", + ... } + >>> s.replace(country_code_map, default=None) + shape: (4,) + Series: \'country_code\' [str] + [ + "France" + null + "Germany" + "unspecified" + ] + + The return type can be overridden with the `return_dtype` argument. + + >>> s = pl.Series("a", [0, 1, 2, 3]) + >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + shape: (4,) + Series: \'a\' [u8] + [ + 0 + 10 + 20 + 0 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. 
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. 
+ + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. 
+ null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit."""
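
The stub docstrings above already carry doctest-style examples; the short sketch below simply exercises a few of the documented methods end to end (fill_null, is_between, and the when/then/otherwise pattern that the set/scatter notes recommend). It is an illustrative snippet derived from those docstring examples, not part of the generated stub files, and the assert-based spot checks and variable names are assumptions for demonstration only.

# Minimal usage sketch for a few Series methods documented in the stubs above.
# Expected values are taken from the docstring examples; the assert-style
# checks and names here are illustrative assumptions, not part of the stubs.
import polars as pl

s = pl.Series("a", [1, 2, 3, None])

# fill_null with a strategy (see the fill_null stub above): forward-fill the null.
assert s.fill_null(strategy="forward").to_list() == [1, 2, 3, 3]

# is_between with a half-open interval (see the is_between stub above).
nums = pl.Series("num", [1, 2, 3, 4, 5])
assert nums.is_between(2, 4, closed="left").to_list() == [False, True, True, False, False]

# The set/scatter docstrings recommend a when/then/otherwise expression instead
# of in-place masking; this mirrors the example given there.
out = (
    pl.Series("a", [1, 2, 3])
    .to_frame()
    .select(pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")))
)
assert out.to_series().to_list() == [1, 10, 3]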