diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/dataframe/frame.pyi new file mode 100644 index 0000000..32475a4 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/dataframe/frame.pyi @@ -0,0 +1,7409 @@ +#: version 0.20.23 +import P +import deltalake +import deltalake.table +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from pathlib import Path +from polars._utils.construction.dataframe import ( + arrow_to_pydf as arrow_to_pydf, + dataframe_to_pydf as dataframe_to_pydf, + dict_to_pydf as dict_to_pydf, + iterable_to_pydf as iterable_to_pydf, + numpy_to_pydf as numpy_to_pydf, + pandas_to_pydf as pandas_to_pydf, + sequence_to_pydf as sequence_to_pydf, + series_to_pydf as series_to_pydf, +) +from polars._utils.construction.other import numpy_to_idxs as numpy_to_idxs +from polars._utils.convert import parse_as_duration_string as parse_as_duration_string +from polars._utils.deprecation import ( + deprecate_function as deprecate_function, + deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, + deprecate_parameter_as_positional as deprecate_parameter_as_positional, + deprecate_renamed_function as deprecate_renamed_function, + deprecate_renamed_parameter as deprecate_renamed_parameter, + deprecate_saturating as deprecate_saturating, + issue_deprecation_warning as issue_deprecation_warning, +) +from polars._utils.parse_expr_input import parse_as_expression as parse_as_expression +from polars._utils.unstable import ( + issue_unstable_warning as issue_unstable_warning, + unstable as unstable, +) +from polars._utils.various import ( + is_bool_sequence as is_bool_sequence, + is_int_sequence as is_int_sequence, + is_str_sequence as is_str_sequence, + normalize_filepath as normalize_filepath, + parse_version as parse_version, + range_to_slice as range_to_slice, + scale_bytes as scale_bytes, + warn_null_comparison as warn_null_comparison, +) +from polars._utils.wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import ( + DynamicGroupBy as DynamicGroupBy, + GroupBy as GroupBy, + RollingGroupBy as RollingGroupBy, +) +from polars.datatypes.classes import ( + Boolean as Boolean, + Float64 as Float64, + Object as Object, + String as String, +) +from polars.dependencies import ( + _check_for_numpy as _check_for_numpy, + _check_for_pandas as _check_for_pandas, + _check_for_pyarrow as _check_for_pyarrow, + hvplot as hvplot, + import_optional as import_optional, +) +from polars.exceptions import ( + ModuleUpgradeRequired as ModuleUpgradeRequired, + NoRowsReturnedError as NoRowsReturnedError, + TooManyRowsReturnedError as TooManyRowsReturnedError, +) +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import ( + _XLFormatCache as _XLFormatCache, + _unpack_multi_column_dict as _unpack_multi_column_dict, + _xl_apply_conditional_formats as _xl_apply_conditional_formats, + _xl_inject_sparklines as _xl_inject_sparklines, + _xl_setup_table_columns as _xl_setup_table_columns, + _xl_setup_table_options as _xl_setup_table_options, + _xl_setup_workbook as _xl_setup_workbook, + _xl_unique_table_name as _xl_unique_table_name, +) +from polars.selectors import ( + 
_expand_selector_dicts as _expand_selector_dicts, + _expand_selectors as _expand_selectors, +) +from polars.slice import PolarsSlice as PolarsSlice +from typing import ( + Any, + Callable, + ClassVar as _ClassVar, + Collection, + IO, + Iterable, + Iterator, + Mapping, + NoReturn, + Sequence, +) + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_HVPLOT_AVAILABLE: bool +_PANDAS_AVAILABLE: bool +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method +_write_clipboard_string: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__( + self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ... + ) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_arrow( + cls, data: pa.Table | pa.RecordBatch, schema: SchemaDefinition | None = ... + ) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow Table, RecordBatch, or sequence of sequences + Data representing an Arrow Table or RecordBatch. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. 
+ """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to( + self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType + ) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__( + self, + item: str + | int + | np.ndarray[Any, Any] + | MultiColSelector + | tuple[int, MultiColSelector] + | tuple[MultiRowSelector, MultiColSelector] + | tuple[MultiRowSelector, int | str] + | tuple[int, int | str], + ) -> DataFrame | Series: + """Get item. Does quite a lot. 
Read the comments.""" + def __setitem__( + self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any + ) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + """ + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + """ + def to_arrow(self) -> pa.Table: + """ + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + """ + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + """ + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + """ + def to_dicts(self) -> list[dict[str, Any]]: + """ + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + """ + def to_numpy(self) -> np.ndarray[Any, Any]: + """ + Convert this DataFrame to a NumPy ndarray. + + Parameters + ---------- + structured + Return a `structured array`_ with a data type that corresponds to the + DataFrame schema. If set to `False` (default), a 2D ndarray is + returned instead. + + .. _structured array: https://numpy.org/doc/stable/user/basics.rec.html + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + writable + Ensure the resulting array is writable. This will force a copy of the data + if the array was created without copy, as the underlying Arrow data is + immutable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + function for the conversion to numpy if necessary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. 
+ + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + """ + Convert this DataFrame to a pandas DataFrame. + + This operation copies data if `use_pyarrow_extension_array` is not enabled. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow-backed extension arrays instead of NumPy arrays for the columns + of the pandas DataFrame. This allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy if those operations are not supported by PyArrow + compute functions. + **kwargs + Additional keyword arguments to be passed to + :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Notes + ----- + This operation requires that both :mod:`pandas` and :mod:`pyarrow` are + installed. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_pandas() + foo bar ham + 0 1 6.0 a + 1 2 7.0 b + 2 3 8.0 c + + Null values in numeric columns are converted to `NaN`. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6.0, None, 8.0], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> df.to_pandas() + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + + Pass `use_pyarrow_extension_array=True` to get a pandas DataFrame with columns + backed by PyArrow extension arrays. This will preserve null values. + + >>> df.to_pandas(use_pyarrow_extension_array=True) + foo bar ham + 0 1 6.0 + 1 2 b + 2 8.0 c + >>> _.dtypes + foo int64[pyarrow] + bar double[pyarrow] + ham large_string[pyarrow] + dtype: object + """ + def _to_pandas_with_object_columns(self, **kwargs: Any) -> pd.DataFrame: ... + def _to_pandas_without_object_columns(self, df: DataFrame, **kwargs: Any) -> pd.DataFrame: ... + def to_series(self, index: int = ...) -> Series: + """ + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + """ + def to_init_repr(self, n: int = ...) -> str: + """ + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.String), + ... ] + ... 
) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.String), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + """ + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + """ + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + """ + def write_csv(self, file: str | Path | IO[str] | IO[bytes] | None = ...) -> str | None: + """ + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. 
+ Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + """ + def write_clipboard(self, **kwargs: Any) -> None: + """ + Copy `DataFrame` in csv format to the system clipboard with `write_csv`. + + Useful for pasting into Excel or other similar spreadsheet software. + + Parameters + ---------- + separator + Separate CSV fields with this symbol. + kwargs + Additional arguments to pass to `write_csv`. + + See Also + -------- + polars.read_clipboard: Read a DataFrame from the clipboard. + write_csv: Write to comma-separated values (CSV) file. + """ + def write_avro( + self, file: str | Path | IO[bytes], compression: AvroCompression = ..., name: str = ... + ) -> None: + """ + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + """ + def write_excel( + self, workbook: Workbook | IO[bytes] | Path | str | None = ..., worksheet: str | None = ... + ) -> Workbook: + """ + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. 
+ dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". 
These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is initialized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. 
+ https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + + Export a table containing two different types of sparklines. 
Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + """ + def write_ipc( + self, file: str | Path | IO[bytes] | None, compression: IpcCompression = ... + ) -> BytesIO | None: + """ + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + future + Setting this to `True` will write Polars\' internal data structures that + might not be available by other Arrow implementations. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + """ + def write_ipc_stream( + self, file: str | Path | IO[bytes] | None, compression: IpcCompression = ... 
+ ) -> BytesIO | None: + """ + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + """ + def write_parquet(self, file: str | Path | BytesIO) -> None: + """ + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This is the default behavior. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + data_page_size + Size of the data page in bytes. Defaults to 1024^2 bytes. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + """ + def write_database(self, table_name: str, connection: str) -> int: + """ + Write a polars frame to a database. + + Parameters + ---------- + table_name + Schema-qualified name of the table to create or append to in the target + SQL database. If your table name contains special characters, it should + be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_table_exists : {\'append\', \'replace\', \'fail\'} + The insert mode: + + * \'replace\' will create a new database table, overwriting an existing one. 
+ * \'append\' will append to an existing table. + * \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine to use for writing frame data. + + Returns + ------- + int + The number of rows affected, if the driver provides this information. + Otherwise, returns -1. + """ + def write_delta( + self, target: str | Path | deltalake.DeltaTable + ) -> deltalake.table.TableMerger | None: + """ + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\', \'merge\'} + How to handle existing data. + + - If \'error\', throw an error if the table already exists (default). + - If \'append\', will add new data. + - If \'overwrite\', will replace table with new data. + - If \'ignore\', will not write anything if table already exists. + - If \'merge\', return a `TableMerger` object to merge data from the DataFrame + with the existing data. + overwrite_schema + If True, allows updating the schema of the table. + + .. deprecated:: 0.20.14 + Use the parameter `delta_write_options` instead and pass + `{"schema_mode": "overwrite"}`. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + - See a list of supported storage options for S3 `here `__. + - See a list of supported storage options for GCS `here `__. + - See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + delta_merge_options + Keyword arguments which are required to `MERGE` a Delta lake Table. + See a list of supported merge options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + TableNotFoundError + If the delta table doesn\'t exist and MERGE action is triggered + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, specifying the + `schema_mode` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... mode="overwrite", + ... delta_write_options={"schema_mode": "overwrite"}, + ... ) # doctest: +SKIP + + Write a DataFrame as a Delta Lake table to a cloud object store like S3. 
+ + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + Merge the DataFrame with an existing Delta Lake table. + For all `TableMerger` methods, check the deltalake docs + `here `__. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> ( + ... df.write_delta( + ... "table_path", + ... mode="merge", + ... delta_merge_options={ + ... "predicate": "s.foo = t.foo", + ... "source_alias": "s", + ... "target_alias": "t", + ... }, + ... ) + ... .when_matched_update_all() + ... .when_not_matched_insert_all() + ... .execute() + ... ) # doctest: +SKIP + """ + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + """ + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)], + ... ) + >>> df.estimated_size() + 17888890 + >>> df.estimated_size("mb") + 17.0601749420166 + """ + def transpose(self) -> Self: + """ + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 4 ┆ 5 ┆ 6 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["x", "y", "z"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["x", "y", "z"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["i", "j", "k"], a=[1, 2, 3], b=[4, 5, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ i ┆ j ┆ k │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ i ┆ j ┆ k │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 4 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + """ + def reverse(self) -> DataFrame: + """ + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + """ + def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> DataFrame: + """ + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name, or a function + that takes the old name as input and returns the new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + >>> df.rename(lambda column_name: "c" + column_name[1:]) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ coo ┆ car ┆ cam │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + """ + def insert_column(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + """ + def filter( + self, + *predicates: IntoExprColumn + | Iterable[IntoExprColumn] + | bool + | list[bool] + | np.ndarray[Any, Any], + **constraints: Any, + ) -> DataFrame: + """ + Filter the rows in the DataFrame based on one or more predicate expressions. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression(s) that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... 
) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + """ + def glimpse(self) -> str | None: + """ + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + """ + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + """ + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method used when calculating percentiles. + + Notes + ----- + The median is included by default as the 50% percentile. + + Warnings + -------- + We do not guarantee the output of `describe` to be stable. It will show + statistics that we deem informative, and may be updated in the future. + Using `describe` programmatically (versus interactive exploration) is + not recommended for this reason. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date, time + >>> df = pl.DataFrame( + ... { + ... "float": [1.0, 2.8, 3.0], + ... "int": [40, 50, None], + ... "bool": [True, False, True], + ... "str": ["zz", "xx", "yy"], + ... "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)], + ... "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)], + ... } + ... 
) + + Show default frame statistics: + + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + + Customize which percentiles are displayed, applying linear interpolation: + + >>> with pl.Config(tbl_rows=12): + ... df.describe( + ... percentiles=[0.1, 0.3, 0.5, 0.7, 0.9], + ... interpolation="linear", + ... ) + shape: (11, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │ + │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │ + │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │ + │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + """ + def get_column_index(self, name: str) -> int: + """ + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.get_column_index("ham") + 2 + """ + def replace_column(self, index: int, column: Series) -> Self: + """ + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + """ + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + """ + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. 
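+ Passing an expression here (for example `pl.col("a") + pl.col("b") * 2`) sorts the rows by the expression's evaluated values without adding that expression as a column to the output, as the examples below show.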
+ *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + multithreaded + Sort using multiple threads. + maintain_order + Whether the order should be maintained if elements are equal. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + """ + def sql(self, query: str) -> Self: + """ + Execute a SQL query against the DataFrame. + + .. warning:: + This functionality is considered **unstable**, although it is close to + being considered stable. It may be changed at any point without it being + considered a breaking change. + + Parameters + ---------- + query + SQL query to execute. + table_name + Optionally provide an explicit name for the table that represents the + calling frame (the alias "self" will always be registered/available). + + Notes + ----- + * The calling frame is automatically registered as a table in the SQL context + under the name "self". All DataFrames and LazyFrames found in the current + set of global variables are also registered, using their variable name. + * More control over registration and execution behaviour is available by + using the :class:`SQLContext` object. + * The SQL query executes entirely in lazy mode before being collected and + returned as a DataFrame. + + See Also + -------- + SQLContext + + Examples + -------- + >>> from datetime import date + >>> df1 = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["zz", "yy", "xx"], + ... "c": [date(1999, 12, 31), date(2010, 10, 10), date(2077, 8, 8)], + ... } + ... ) + + Query the DataFrame using SQL: + + >>> df1.sql("SELECT c, b FROM self WHERE a > 1") + shape: (2, 2) + ┌────────────┬─────┐ + │ c ┆ b │ + │ --- ┆ --- │ + │ date ┆ str │ + ╞════════════╪═════╡ + │ 2010-10-10 ┆ yy │ + │ 2077-08-08 ┆ xx │ + └────────────┴─────┘ + + Join two DataFrames using SQL. + + >>> df2 = pl.DataFrame({"a": [3, 2, 1], "d": [125, -654, 888]}) + >>> df1.sql( + ... \'\'\' + ... SELECT self.*, d + ... FROM self + ... INNER JOIN df2 USING (a) + ... WHERE a > 1 AND EXTRACT(year FROM c) < 2050 + ... \'\'\' + ... 
) + shape: (1, 4) + ┌─────┬─────┬────────────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ date ┆ i64 │ + ╞═════╪═════╪════════════╪══════╡ + │ 2 ┆ yy ┆ 2010-10-10 ┆ -654 │ + └─────┴─────┴────────────┴──────┘ + + Apply transformations to a DataFrame using SQL, aliasing "self" to "frame". + + >>> df1.sql( + ... query=\'\'\' + ... SELECT + ... a, + ... (a % 2 == 0) AS a_is_even, + ... CONCAT_WS(\':\', b, b) AS b_b, + ... EXTRACT(year FROM c) AS year, + ... 0::float4 AS "zero", + ... FROM frame + ... \'\'\', + ... table_name="frame", + ... ) + shape: (3, 5) + ┌─────┬───────────┬───────┬──────┬──────┐ + │ a ┆ a_is_even ┆ b_b ┆ year ┆ zero │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ bool ┆ str ┆ i32 ┆ f32 │ + ╞═════╪═══════════╪═══════╪══════╪══════╡ + │ 1 ┆ false ┆ zz:zz ┆ 1999 ┆ 0.0 │ + │ 2 ┆ true ┆ yy:yy ┆ 2010 ┆ 0.0 │ + │ 3 ┆ false ┆ xx:xx ┆ 2077 ┆ 0.0 │ + └─────┴───────────┴───────┴──────┴──────┘ + """ + def top_k(self, k: int) -> DataFrame: + """ + Return the `k` largest elements. + + If `descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + """ + def bottom_k(self, k: int) -> DataFrame: + """ + Return the `k` smallest elements. + + If `descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` largest. Bottom-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
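+ With multiple `by` columns, rows are compared on the first column and later columns only break ties, which is why both "a" rows precede the "b" rows in the result below.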
+ + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + """ + def equals(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + """ + def replace(self, column: str, new_column: Series) -> Self: + """ + Replace a column by a new Series. + + .. deprecated:: 0.19.0 + Use :meth:`with_columns` instead, e.g. + `df = df.with_columns(new_column.alias(column_name))`. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + """ + def slice(self, offset: int, length: int | None = ...) -> Self: + """ + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + """ + def head(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + """ + def tail(self, n: int = ...) -> Self: + """ + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + """ + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + """ + def drop_nulls( + self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ... + ) -> DataFrame: + """ + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + """ + def pipe( + self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs + ) -> T: + """ + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. 
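+ For example, `df.pipe(cast_str_to_int, "b")` passes "b" positionally and is equivalent to the keyword form used in the first example below.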
+ **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + """ + def with_row_index(self, name: str = ..., offset: int = ...) -> Self: + """ + Add a row index as the first column in the DataFrame. + + Parameters + ---------- + name + Name of the index column. + offset + Start the index at this offset. Cannot be negative. + + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_index() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + >>> df.with_row_index("id", offset=1000) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ + + An index column can also be created using the expressions :func:`int_range` + and :func:`len`. + + >>> df.select( + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), + ... pl.all(), + ... ) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + """ + Add a column at index 0 that counts the rows. + + .. deprecated:: 0.20.4 + Use :meth:`with_row_index` instead. + Note that the default column name has changed from \'row_nr\' to \'index\'. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() # doctest: +SKIP + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + """ + def group_by(self, *by: IntoExpr | Iterable[IntoExpr], **named_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + Parameters + ---------- + *by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + maintain_order + Ensure that the order of the groups is consistent with the input data. 
+ This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + **named_by + Additional columns to group by, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + """ + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a temporal or integer column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... 
+ * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `group_by` is + specified, then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {UInt32, UInt64, Int32, Int64}. Note that the first three get temporarily + cast to Int64, so if performance matters use an Int64 column. + period + Length of the window - must be non-negative. + offset + Offset of the window. Default is `-period`. + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + group_by + Also group by this column/these columns + check_sorted + Check whether `index_column` is sorted (or, if `group_by` is given, + check whether it\'s sorted within each group). + When the `group_by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `group_by` + columns are passed, it will only be sorted within each group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + """ + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + """ + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, `every`, and the earliest + datapoint. See the `start_by` argument description for details. + + .. warning:: + The index column must be sorted in ascending order. If `group_by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `group_by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, does not take effect if `start_by` is \'datapoint\'. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + group_by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. 
+ * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + + The resulting window is then shifted back until the earliest datapoint + is in or in front of it. + check_sorted + Check whether `index_column` is sorted (or, if `group_by` is given, + check whether it\'s sorted within each group). + When the `group_by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `group_by` columns are + passed, it will only be sorted within each group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. 
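+ Note that with `closed="right"` a datapoint lying exactly on a window's upper bound belongs to that window, so the first value (at 00:00:00) lands in the window labelled 2021-12-15 23:00:00 in the output below.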
+ + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... group_by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + """ + def upsample(self, time_column: str) -> Self: + """ + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + Time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + Interval will start \'every\' duration. + offset + Change the start of the date_range by this offset. + + .. deprecated:: 0.20.19 + This argument is deprecated and will be removed in the next breaking + release. Instead, chain `upsample` with `dt.offset_by`. + group_by + First group by these columns and then upsample for every group. + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `group_by` columns + are passed, it will only be sorted within each group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... 
datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", group_by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + """ + def join_asof(self, other: DataFrame) -> DataFrame: + """ + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import date + >>> gdp = pl.DataFrame( + ... { + ... "date": pl.date_range( + ... date(2016, 1, 1), + ... date(2020, 1, 1), + ... "1y", + ... eager=True, + ... ), + ... "gdp": [4164, 4411, 4566, 4696, 4827], + ... } + ... 
) + >>> gdp + shape: (5, 2) + ┌────────────┬──────┐ + │ date ┆ gdp │ + │ --- ┆ --- │ + │ date ┆ i64 │ + ╞════════════╪══════╡ + │ 2016-01-01 ┆ 4164 │ + │ 2017-01-01 ┆ 4411 │ + │ 2018-01-01 ┆ 4566 │ + │ 2019-01-01 ┆ 4696 │ + │ 2020-01-01 ┆ 4827 │ + └────────────┴──────┘ + + >>> population = pl.DataFrame( + ... { + ... "date": [date(2016, 3, 1), date(2018, 8, 1), date(2019, 1, 1)], + ... "population": [82.19, 82.66, 83.12], + ... } + ... ).sort("date") + >>> population + shape: (3, 2) + ┌────────────┬────────────┐ + │ date ┆ population │ + │ --- ┆ --- │ + │ date ┆ f64 │ + ╞════════════╪════════════╡ + │ 2016-03-01 ┆ 82.19 │ + │ 2018-08-01 ┆ 82.66 │ + │ 2019-01-01 ┆ 83.12 │ + └────────────┴────────────┘ + + Note how the dates don\'t quite match. If we join them using `join_asof` and + `strategy=\'backward\'`, then each date from `population` which doesn\'t have an + exact match is matched with the closest earlier date from `gdp`: + + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (3, 3) + ┌────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ date ┆ f64 ┆ i64 │ + ╞════════════╪════════════╪══════╡ + │ 2016-03-01 ┆ 82.19 ┆ 4164 │ + │ 2018-08-01 ┆ 82.66 ┆ 4566 │ + │ 2019-01-01 ┆ 83.12 ┆ 4696 │ + └────────────┴────────────┴──────┘ + + Note how: + + - date `2016-03-01` from `population` is matched with `2016-01-01` from `gdp`; + - date `2018-08-01` from `population` is matched with `2018-01-01` from `gdp`. + + If we instead use `strategy=\'forward\'`, then each date from `population` which + doesn\'t have an exact match is matched with the closest later date from `gdp`: + + >>> population.join_asof(gdp, on="date", strategy="forward") + shape: (3, 3) + ┌────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ date ┆ f64 ┆ i64 │ + ╞════════════╪════════════╪══════╡ + │ 2016-03-01 ┆ 82.19 ┆ 4411 │ + │ 2018-08-01 ┆ 82.66 ┆ 4696 │ + │ 2019-01-01 ┆ 83.12 ┆ 4696 │ + └────────────┴────────────┴──────┘ + + Note how: + + - date `2016-03-01` from `population` is matched with `2017-01-01` from `gdp`; + - date `2018-08-01` from `population` is matched with `2019-01-01` from `gdp`. + + Finally, `strategy=\'nearest\'` gives us a mix of the two results above, as each + date from `population` which doesn\'t have an exact match is matched with the + closest date from `gdp`, regardless of whether it\'s earlier or later: + + >>> population.join_asof(gdp, on="date", strategy="nearest") + shape: (3, 3) + ┌────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ date ┆ f64 ┆ i64 │ + ╞════════════╪════════════╪══════╡ + │ 2016-03-01 ┆ 82.19 ┆ 4164 │ + │ 2018-08-01 ┆ 82.66 ┆ 4696 │ + │ 2019-01-01 ┆ 83.12 ┆ 4696 │ + └────────────┴────────────┴──────┘ + + Note how: + + - date `2016-03-01` from `population` is matched with `2016-01-01` from `gdp`; + - date `2018-08-01` from `population` is matched with `2019-01-01` from `gdp`. + + They `by` argument allows joining on another column first, before the asof join. + In this example we join by `country` first, then asof join by date, as above. + + >>> gdp_dates = pl.date_range( # fmt: skip + ... date(2016, 1, 1), date(2020, 1, 1), "1y", eager=True + ... ) + >>> gdp2 = pl.DataFrame( + ... { + ... "country": ["Germany"] * 5 + ["Netherlands"] * 5, + ... "date": pl.concat([gdp_dates, gdp_dates]), + ... "gdp": [4164, 4411, 4566, 4696, 4827, 784, 833, 914, 910, 909], + ... } + ... 
).sort("country", "date") + >>> + >>> gdp2 + shape: (10, 3) + ┌─────────────┬────────────┬──────┐ + │ country ┆ date ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ str ┆ date ┆ i64 │ + ╞═════════════╪════════════╪══════╡ + │ Germany ┆ 2016-01-01 ┆ 4164 │ + │ Germany ┆ 2017-01-01 ┆ 4411 │ + │ Germany ┆ 2018-01-01 ┆ 4566 │ + │ Germany ┆ 2019-01-01 ┆ 4696 │ + │ Germany ┆ 2020-01-01 ┆ 4827 │ + │ Netherlands ┆ 2016-01-01 ┆ 784 │ + │ Netherlands ┆ 2017-01-01 ┆ 833 │ + │ Netherlands ┆ 2018-01-01 ┆ 914 │ + │ Netherlands ┆ 2019-01-01 ┆ 910 │ + │ Netherlands ┆ 2020-01-01 ┆ 909 │ + └─────────────┴────────────┴──────┘ + >>> pop2 = pl.DataFrame( + ... { + ... "country": ["Germany"] * 3 + ["Netherlands"] * 3, + ... "date": [ + ... date(2016, 3, 1), + ... date(2018, 8, 1), + ... date(2019, 1, 1), + ... date(2016, 3, 1), + ... date(2018, 8, 1), + ... date(2019, 1, 1), + ... ], + ... "population": [82.19, 82.66, 83.12, 17.11, 17.32, 17.40], + ... } + ... ).sort("country", "date") + >>> + >>> pop2 + shape: (6, 3) + ┌─────────────┬────────────┬────────────┐ + │ country ┆ date ┆ population │ + │ --- ┆ --- ┆ --- │ + │ str ┆ date ┆ f64 │ + ╞═════════════╪════════════╪════════════╡ + │ Germany ┆ 2016-03-01 ┆ 82.19 │ + │ Germany ┆ 2018-08-01 ┆ 82.66 │ + │ Germany ┆ 2019-01-01 ┆ 83.12 │ + │ Netherlands ┆ 2016-03-01 ┆ 17.11 │ + │ Netherlands ┆ 2018-08-01 ┆ 17.32 │ + │ Netherlands ┆ 2019-01-01 ┆ 17.4 │ + └─────────────┴────────────┴────────────┘ + >>> pop2.join_asof(gdp2, by="country", on="date", strategy="nearest") + shape: (6, 4) + ┌─────────────┬────────────┬────────────┬──────┐ + │ country ┆ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ date ┆ f64 ┆ i64 │ + ╞═════════════╪════════════╪════════════╪══════╡ + │ Germany ┆ 2016-03-01 ┆ 82.19 ┆ 4164 │ + │ Germany ┆ 2018-08-01 ┆ 82.66 ┆ 4696 │ + │ Germany ┆ 2019-01-01 ┆ 83.12 ┆ 4696 │ + │ Netherlands ┆ 2016-03-01 ┆ 17.11 ┆ 784 │ + │ Netherlands ┆ 2018-08-01 ┆ 17.32 ┆ 910 │ + │ Netherlands ┆ 2019-01-01 ┆ 17.4 ┆ 910 │ + └─────────────┴────────────┴────────────┴──────┘ + + """ + def join( + self, + other: DataFrame, + on: str | Expr | Sequence[str | Expr] | None = ..., + how: JoinStrategy = ..., + ) -> DataFrame: + """ + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the Cartesian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + join_nulls + Join on null values. By default null values will never produce matches. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see :class:`polars.StringCache`. + """ + def map_rows( + self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ... + ) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. 
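+ For instance, the row-to-tuple example further below returns two integers per row, which is inferred as two Int64 output columns.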
+ inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + """ + def hstack(self, columns: list[Series] | DataFrame) -> Self: + """ + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + """ + def vstack(self, other: DataFrame) -> Self: + """ + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + """ + def extend(self, other: DataFrame) -> Self: + """ + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. 
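+ A rough sketch of the difference in chunking (not part of the original
+ examples; the chunk counts assume both frames are freshly built,
+ single-chunk frames):
+
+ >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+ >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]})
+ >>> df1.vstack(df2).n_chunks()  # the appended data stays in its own chunk
+ 2
+ >>> df1.extend(df2).n_chunks()  # the data is written into the existing buffers
+ 1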
+ + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. + + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + """ + def drop(self, *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector]) -> DataFrame: + """ + Remove columns from the dataframe. + + Parameters + ---------- + *columns + Names of the columns that should be removed from the dataframe. + Accepts column selector input. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + """ + def drop_in_place(self, name: str) -> Series: + """ + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + """ + def cast( + self, + dtypes: Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType] | PolarsDataType, + ) -> DataFrame: + """ + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... 
"ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns matching one dtype (or dtype group) to another dtype: + + >>> df.cast({pl.Date: pl.Datetime}) + shape: (3, 3) + ┌─────┬─────┬─────────────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ datetime[μs] │ + ╞═════╪═════╪═════════════════════╡ + │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │ + │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │ + │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │ + └─────┴─────┴─────────────────────┘ + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.String).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + """ + def clear(self, n: int = ...) -> Self: + """ + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + """ + def clone(self) -> Self: + """ + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + """ + def get_columns(self) -> list[Series]: + """ + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... 
"b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + """ + def get_column(self, name: str) -> Series: + """ + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + """ + def fill_null( + self, + value: Any | None = ..., + strategy: FillNullStrategy | None = ..., + limit: int | None = ..., + ) -> DataFrame: + """ + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + """ + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + """ + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + """ + def explode( + self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr + ) -> DataFrame: + """ + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. 
The underlying + columns being exploded must be of the `List` or `Array` data type. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + """ + def pivot(self) -> Self: + """ + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. If None, all remaining columns + will be used. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'min\', \'max\', \'first\', \'last\', \'sum\', \'mean\', \'median\', \'len\'} + - An expression to do the aggregation. + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(index="foo", columns="bar", values="baz", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... index=cs.string(), + ... columns=cs.string(), + ... values=cs.numeric(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬─────────────┬─────────────┬─────────────┬─────────────┐ + │ foo ┆ bar ┆ {"one","x"} ┆ {"one","y"} ┆ {"two","x"} ┆ {"two","y"} │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════════════╪═════════════╪═════════════╪═════════════╡ + │ one ┆ x ┆ 5 ┆ null ┆ null ┆ null │ + │ one ┆ y ┆ null ┆ 3 ┆ null ┆ null │ + │ two ┆ x ┆ null ┆ null ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ null ┆ null ┆ 3 │ + └─────┴─────┴─────────────┴─────────────┴─────────────┴─────────────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... 
"col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> df.lazy().group_by(index).agg( + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ).collect() # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + """ + def melt( + self, + id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., + value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., + variable_name: str | None = ..., + value_name: str | None = ..., + ) -> Self: + """ + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + """ + def unstack( + self, + step: int, + how: UnstackDirection = ..., + columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., + fill_values: list[Any] | None = ..., + ) -> DataFrame: + """ + Unstack a long table to a wide form without doing an aggregation. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. 
+ If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + """ + def partition_by( + self, + by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], + *more_by: ColumnNameOrSelector, + ) -> list[Self] | dict[Any, Self]: + """ + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are tuples of + the distinct group values that identify each group. If a single string + was passed to `by`, the keys are a single value instead of a tuple. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {(\'a\',): shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + (\'b\',): shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + (\'c\',): shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + """ + def shift(self, n: int = ...) -> DataFrame: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
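+
+ Because `fill_value` accepts expression input, the fill can also be computed
+ from the data itself; a sketch (not part of the original examples):
+
+ >>> df.shift(-2, fill_value=pl.col("b").max())  # doctest: +SKIP
+
+ Filling with a plain literal instead: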
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + """ + def is_duplicated(self) -> Series: + """ + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + """ + def is_unique(self) -> Series: + """ + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + """ + def lazy(self) -> LazyFrame: + """ + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + """ + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
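+
+ Selectors expand to expressions, so they can be passed here as well; a sketch
+ (assuming `import polars.selectors as cs`, not part of the original examples):
+
+ >>> df.select(cs.string())  # doctest: +SKIP
+
+ Passing expressions as positional arguments: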
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + """ + def select_seq( + self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr + ) -> DataFrame: + """ + Select columns from this DataFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + """ + def with_columns( + self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr + ) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬─────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ i64 │ + ╞═════╪══════╪═══════╪═════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1 │ + │ 2 ┆ 4.0 ┆ true ┆ 4 │ + │ 3 ┆ 10.0 ┆ false ┆ 9 │ + │ 4 ┆ 13.0 ┆ true ┆ 16 │ + └─────┴──────┴───────┴─────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴─────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴─────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + """ + def with_columns_seq( + self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr + ) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + See Also + -------- + with_columns + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + """ + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + """ + def max(self, axis: int | None = ...) -> Self | Series: + """ + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + """ + def max_horizontal(self) -> Series: + """ + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + """ + def min(self, axis: int | None = ...) -> Self | Series: + """ + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + """ + def min_horizontal(self) -> Series: + """ + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + """ + def sum(self) -> Self | Series: + """ + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + """ + def sum_horizontal(self) -> Series: + """ + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
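+ A sketch of that behaviour (not part of the original examples):
+
+ >>> pl.DataFrame({"a": [1, None], "b": [3.0, 4.0]}).sum_horizontal(ignore_nulls=False)  # doctest: +SKIP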
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + """ + def mean(self) -> Self | Series: + """ + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + """ + def mean_horizontal(self) -> Series: + """ + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + """ + def std(self, ddof: int = ...) -> Self: + """ + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + """ + def var(self, ddof: int = ...) -> Self: + """ + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + """ + def median(self) -> Self: + """ + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + """ + def product(self) -> DataFrame: + """ + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + """ + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + """ + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + """ + def to_dummies( + self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ... + ) -> Self: + """ + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + """ + def unique( + self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ... + ) -> DataFrame: + """ + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + """ + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + """ + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + """ + def approx_n_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.20.11 + Use `select(pl.all().approx_n_unique())` instead. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() # doctest: +SKIP + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + """ + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + """ + def sample(self, n: int | Series | None = ...) -> Self: + """ + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + """ + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + """ + Apply a horizontal reduction on a DataFrame. 
+ + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + String = String + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + """ + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + """ + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. 
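+
+ The predicate form can be combined with `named=True` just like the index form;
+ a sketch (not part of the original examples):
+
+ >>> df.row(by_predicate=(pl.col("ham") == "b"), named=True)  # doctest: +SKIP
+
+ Returning the matching row as a plain tuple: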
+ + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + """ + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + """ + Returns all data in the DataFrame as a list of rows of python-native values. + + By default, each row is returned as a tuple of values given in the same order + as the frame columns. Setting `named=True` will return rows of dictionaries + instead. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + You should also consider using `iter_rows` instead, to avoid materialising all + the data at once; there is little performance difference between the two, but + peak memory can be reduced if processing rows in batches. + + Returns + ------- + list of row value tuples (default), or list of dictionaries (if `named=True`). + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + """ + def rows_by_key( + self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] + ) -> dict[Any, Iterable[Any]]: + """ + Returns all data as a dictionary of python-native values keyed by some column. + + This method is like `rows`, but instead of returning rows in a flat list, rows + are grouped by the values in the `key` column(s) and returned as a dictionary. + + Note that this method should not be used in place of native operations, due to + the high cost of materializing all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. 
+
+ Notes
+ -----
+ If you have `ns`-precision temporal values you should be aware that Python
+ natively only supports up to `μs`-precision; `ns`-precision values will be
+ truncated to microseconds on conversion to Python. If this matters to your
+ use-case you should export to a different format (such as Arrow or NumPy).
+
+ See Also
+ --------
+ rows : Materialize all frame data as a list of rows (potentially expensive).
+ iter_rows : Row iterator over frame data (does not materialize all rows).
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "w": ["a", "b", "b", "a"],
+ ... "x": ["q", "q", "q", "k"],
+ ... "y": [1.0, 2.5, 3.0, 4.5],
+ ... "z": [9, 8, 7, 6],
+ ... }
+ ... )
+
+ Group rows by the given key column(s):
+
+ >>> df.rows_by_key(key=["w"])
+ defaultdict(<class \'list\'>,
+ {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)],
+ \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]})
+
+ Return the same row groupings as dictionaries:
+
+ >>> df.rows_by_key(key=["w"], named=True)
+ defaultdict(<class \'list\'>,
+ {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9},
+ {\'x\': \'k\', \'y\': 4.5, \'z\': 6}],
+ \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8},
+ {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]})
+
+ Return row groupings, assuming keys are unique:
+
+ >>> df.rows_by_key(key=["z"], unique=True)
+ {9: (\'a\', \'q\', 1.0),
+ 8: (\'b\', \'q\', 2.5),
+ 7: (\'b\', \'q\', 3.0),
+ 6: (\'a\', \'k\', 4.5)}
+
+ Return row groupings as dictionaries, assuming keys are unique:
+
+ >>> df.rows_by_key(key=["z"], named=True, unique=True)
+ {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0},
+ 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5},
+ 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0},
+ 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}}
+
+ Return dictionary rows grouped by a compound key, including key values:
+
+ >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True)
+ defaultdict(<class \'list\'>,
+ {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}],
+ (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8},
+ {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}],
+ (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]})
+ """
+ def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
+ """
+ Returns an iterator over the DataFrame of rows of python-native values.
+
+ Parameters
+ ----------
+ named
+ Return dictionaries instead of tuples. The dictionaries are a mapping of
+ column name to row value. This is more expensive than returning a regular
+ tuple, but allows for accessing values by column name.
+ buffer_size
+ Determines the number of rows that are buffered internally while iterating
+ over the data; you should only modify this in very specific cases where the
+ default value is determined not to be a good fit to your access pattern, as
+ the speedup from using the buffer is significant (~2-4x). Setting this
+ value to zero disables row buffering (not recommended).
+
+ Notes
+ -----
+ If you have `ns`-precision temporal values you should be aware that Python
+ natively only supports up to `μs`-precision; `ns`-precision values will be
+ truncated to microseconds on conversion to Python. If this matters to your
+ use-case you should export to a different format (such as Arrow or NumPy).
+
+ Warnings
+ --------
+ Row iteration is not optimal as the underlying data is stored in columnar form;
+ where possible, prefer export via one of the dedicated export/output methods
+ that deals with columnar data.
+ + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + """ + def iter_columns(self) -> Iterator[Series]: + """ + Returns an iterator over the columns of this DataFrame. + + Yields + ------ + Series + + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + """ + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + """ + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + """ + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + """ + def gather_every(self, n: int, offset: int = ...) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. 
+ + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + >>> s.gather_every(2, offset=1) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + """ + def hash_rows( + self, + seed: int = ..., + seed_1: int | None = ..., + seed_2: int | None = ..., + seed_3: int | None = ..., + ) -> Series: + """ + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + """ + def interpolate(self) -> DataFrame: + """ + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + """ + def is_empty(self) -> bool: + """ + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + """ + def to_struct(self, name: str = ...) -> Series: + """ + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + """ + def unnest( + self, + columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], + *more_columns: ColumnNameOrSelector, + ) -> Self: + """ + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + """ + def corr(self, **kwargs: Any) -> DataFrame: + """ + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + """ + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update( + self, + other: DataFrame, + on: str | Sequence[str] | None = ..., + how: Literal["left", "inner", "outer"] = ..., + ) -> DataFrame: + """ + Update the values in this `DataFrame` with the values in `other`. + + .. 
warning::
+ This functionality is considered **unstable**. It may be changed
+ at any point without it being considered a breaking change.
+
+ By default, null values in the right frame are ignored. Use
+ `include_nulls=True` to overwrite values in this frame with
+ null values in the other frame.
+
+ Parameters
+ ----------
+ other
+ DataFrame that will be used to update the values
+ on
+ Column names that will be joined on. If set to `None` (default),
+ the implicit row index of each frame is used as a join key.
+ how : {\'left\', \'inner\', \'outer\'}
+ * \'left\' will keep all rows from the left table; rows may be duplicated
+ if multiple rows in the right frame match the left row\'s key.
+ * \'inner\' keeps only those rows where the key exists in both frames.
+ * \'outer\' will update existing rows where the key matches while also
+ adding any new rows contained in the given frame.
+ left_on
+ Join column(s) of the left DataFrame.
+ right_on
+ Join column(s) of the right DataFrame.
+ include_nulls
+ If True, null values from the right dataframe will be used to update the
+ left dataframe.
+
+ Notes
+ -----
+ This is syntactic sugar for a left/inner join, with an optional coalesce
+ when `include_nulls = False`
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "A": [1, 2, 3, 4],
+ ... "B": [400, 500, 600, 700],
+ ... }
+ ... )
+ >>> df
+ shape: (4, 2)
+ ┌─────┬─────┐
+ │ A ┆ B │
+ │ --- ┆ --- │
+ │ i64 ┆ i64 │
+ ╞═════╪═════╡
+ │ 1 ┆ 400 │
+ │ 2 ┆ 500 │
+ │ 3 ┆ 600 │
+ │ 4 ┆ 700 │
+ └─────┴─────┘
+ >>> new_df = pl.DataFrame(
+ ... {
+ ... "B": [-66, None, -99],
+ ... "C": [5, 3, 1],
+ ... }
+ ... )
+
+ Update `df` values with the non-null values in `new_df`, by row index:
+
+ >>> df.update(new_df)
+ shape: (4, 2)
+ ┌─────┬─────┐
+ │ A ┆ B │
+ │ --- ┆ --- │
+ │ i64 ┆ i64 │
+ ╞═════╪═════╡
+ │ 1 ┆ -66 │
+ │ 2 ┆ 500 │
+ │ 3 ┆ -99 │
+ │ 4 ┆ 700 │
+ └─────┴─────┘
+
+ Update `df` values with the non-null values in `new_df`, by row index,
+ but only keeping those rows that are common to both frames:
+
+ >>> df.update(new_df, how="inner")
+ shape: (3, 2)
+ ┌─────┬─────┐
+ │ A ┆ B │
+ │ --- ┆ --- │
+ │ i64 ┆ i64 │
+ ╞═════╪═════╡
+ │ 1 ┆ -66 │
+ │ 2 ┆ 500 │
+ │ 3 ┆ -99 │
+ └─────┴─────┘
+
+ Update `df` values with the non-null values in `new_df`, using an outer join
+ strategy that defines explicit join columns in each frame:
+
+ >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer")
+ shape: (5, 2)
+ ┌─────┬─────┐
+ │ A ┆ B │
+ │ --- ┆ --- │
+ │ i64 ┆ i64 │
+ ╞═════╪═════╡
+ │ 1 ┆ -99 │
+ │ 2 ┆ 500 │
+ │ 3 ┆ 600 │
+ │ 4 ┆ 700 │
+ │ 5 ┆ -66 │
+ └─────┴─────┘
+
+ Update `df` values including null values in `new_df`, using an outer join
+ strategy that defines explicit join columns in each frame:
+
+ >>> df.update(
+ ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True
+ ... )
+ shape: (5, 2)
+ ┌─────┬──────┐
+ │ A ┆ B │
+ │ --- ┆ --- │
+ │ i64 ┆ i64 │
+ ╞═════╪══════╡
+ │ 1 ┆ -99 │
+ │ 2 ┆ 500 │
+ │ 3 ┆ null │
+ │ 4 ┆ 700 │
+ │ 5 ┆ -66 │
+ └─────┴──────┘
+ """
+ def count(self) -> DataFrame:
+ """
+ Return the number of non-null elements for each column.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]}
+ ...
) + >>> df.count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 3 ┆ 0 │ + └─────┴─────┴─────┘ + """ + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + Check whether the `index` column is sorted (or, if `by` is given, + check whether it's sorted within each group). + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + Check whether `index_column` is sorted (or, if `by` is given, + check whether it's sorted within each group). 
+ When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + """ + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + + The resulting window is then shifted back until the earliest datapoint + is in or in front of it. + check_sorted + Check whether `index_column` is sorted (or, if `by` is given, + check whether it\'s sorted within each group). + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + """ + def apply( + self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ... + ) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. 
+ return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + """ + def take_every(self, n: int, offset: int = ...) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def plot(self): ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... + +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
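
The `frame.pyi` stub above only declares signatures and docstrings. As a quick sanity check of the `DataFrame` surface it describes, here is a minimal usage sketch (not part of the generated stubs; the data values and variable names are made up for illustration, assuming polars 0.20.x is installed) exercising `update` and `rows_by_key`:

import polars as pl

df = pl.DataFrame({"A": [1, 2, 3, 4], "B": [400, 500, 600, 700]})
new_df = pl.DataFrame({"B": [-66, None, -99], "C": [5, 3, 1]})

# By default `update` joins on the implicit row index and ignores nulls in `new_df`.
updated = df.update(new_df)

# Passing include_nulls=True lets null values in `new_df` overwrite values in `df`.
updated_with_nulls = df.update(
    new_df, left_on="A", right_on="C", how="outer", include_nulls=True
)

# `rows_by_key` materialises python-native rows keyed by the chosen column(s).
groups = df.rows_by_key(key=["A"], named=True)
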
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/expr/expr.pyi new file mode 100644 index 0000000..4a47fe4 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/expr/expr.pyi @@ -0,0 +1,8772 @@ +#: version 0.20.23 +import P +import np as np +import pl +from polars.polars import PyExpr +from datetime import timedelta +from pathlib import Path +from polars._utils.convert import ( + negate_duration_string as negate_duration_string, + parse_as_duration_string as parse_as_duration_string, +) +from polars._utils.deprecation import ( + deprecate_function as deprecate_function, + deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, + deprecate_renamed_function as deprecate_renamed_function, + deprecate_renamed_parameter as deprecate_renamed_parameter, + deprecate_saturating as deprecate_saturating, + issue_deprecation_warning as issue_deprecation_warning, +) +from polars._utils.parse_expr_input import ( + parse_as_expression as parse_as_expression, + parse_as_list_of_expressions as parse_as_list_of_expressions, + parse_predicates_constraints_as_expression as parse_predicates_constraints_as_expression, +) +from polars._utils.unstable import ( + issue_unstable_warning as issue_unstable_warning, + unstable as unstable, +) +from polars._utils.various import ( + find_stacklevel as find_stacklevel, + no_default as no_default, + normalize_filepath as normalize_filepath, + sphinx_accessor as sphinx_accessor, + warn_null_comparison as warn_null_comparison, +) +from polars.datatypes.classes import Int64 as Int64 +from polars.datatypes.convert import ( + is_polars_dtype as is_polars_dtype, + py_type_to_dtype as py_type_to_dtype, +) +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import ( + CustomUFuncWarning as CustomUFuncWarning, + PolarsInefficientMapWarning as PolarsInefficientMapWarning, +) +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.meta.thread_pool import thread_pool_size as thread_pool_size +from typing import ( + Any, + Callable, + ClassVar as _ClassVar, + Collection, + Iterable, + Mapping, + NoReturn, + Sequence, +) + +TYPE_CHECKING: bool +BUILDING_SPHINX_DOCS: None +py_arg_where: builtin_function_or_method + +class Expr: + class _map_batches_wrapper: + def __init__( + self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None + ) -> None: ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... + + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: IntoExpr) -> Self: ... + def __radd__(self, other: IntoExpr) -> Self: ... 
+ def __and__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __rand__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __eq__(self, other: IntoExpr) -> Self: ... + def __floordiv__(self, other: IntoExpr) -> Self: ... + def __rfloordiv__(self, other: IntoExpr) -> Self: ... + def __ge__(self, other: IntoExpr) -> Self: ... + def __gt__(self, other: IntoExpr) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: IntoExpr) -> Self: ... + def __lt__(self, other: IntoExpr) -> Self: ... + def __mod__(self, other: IntoExpr) -> Self: ... + def __rmod__(self, other: IntoExpr) -> Self: ... + def __mul__(self, other: IntoExpr) -> Self: ... + def __rmul__(self, other: IntoExpr) -> Self: ... + def __ne__(self, other: IntoExpr) -> Self: ... + def __neg__(self) -> Self: ... + def __or__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __ror__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, exponent: IntoExprColumn | int | float) -> Self: ... + def __rpow__(self, base: IntoExprColumn | int | float) -> Expr: ... + def __sub__(self, other: IntoExpr) -> Self: ... + def __rsub__(self, other: IntoExpr) -> Self: ... + def __truediv__(self, other: IntoExpr) -> Self: ... + def __rtruediv__(self, other: IntoExpr) -> Self: ... + def __xor__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __rxor__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __array_ufunc__( + self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any + ) -> Self: + """Numpy universal functions.""" + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + """ + Read an expression from a JSON file. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + Warnings + -------- + This function uses :mod:`pickle` under some circumstances, and as + such inherits the security implications. Deserializing can execute + arbitrary code so it should only be attempted on trusted data. + pickle is only used when the logical plan contains python UDFs. + + See Also + -------- + Expr.meta.serialize + + Examples + -------- + >>> from io import StringIO + >>> expr = pl.col("foo").sum().over("bar") + >>> json = expr.meta.serialize() + >>> pl.Expr.deserialize(StringIO(json)) # doctest: +ELLIPSIS + + """ + def to_physical(self) -> Self: + """ + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... 
) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + """ + def any(self) -> Self: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + """ + def all(self) -> Self: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + """ + def arg_true(self) -> Self: + """ + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + """ + def sqrt(self) -> Self: + """ + Compute the square root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + """ + def cbrt(self) -> Self: + """ + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + """ + def log10(self) -> Self: + """ + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + """ + def exp(self) -> Self: + """ + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + """ + def alias(self, name: str) -> Self: + """ + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + """ + def map_alias(self, function: Callable[[str], str]) -> Self: + """ + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + """ + def prefix(self, prefix: str) -> Self: + """ + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. 
+ + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + """ + def suffix(self, suffix: str) -> Self: + """ + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + """ + def keep_name(self) -> Self: + """ + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + """ + def exclude( + self, + columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], + *more_columns: str | PolarsDataType, + ) -> Self: + """ + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... 
) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + """ + def pipe( + self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs + ) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + """ + def not_(self) -> Self: + """ + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + """ + def is_null(self) -> Self: + """ + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + """ + def is_not_null(self) -> Self: + """ + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + """ + def is_finite(self) -> Self: + """ + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + """ + def is_infinite(self) -> Self: + """ + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + """ + def is_nan(self) -> Self: + """ + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + """ + def is_not_nan(self) -> Self: + """ + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + """ + def agg_groups(self) -> Self: + """ + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + """ + def count(self) -> Self: + """ + Return the number of non-null elements in the column. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + len + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + └─────┴─────┘ + """ + def len(self) -> Self: + """ + Return the number of elements in the column. + + Null values count towards the total. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + count + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + """ + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + """ + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + """ + def append(self, other: IntoExpr) -> Self: + """ + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + """ + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + """ + def drop_nulls(self) -> Self: + """ + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + """ + def drop_nans(self) -> Self: + """ + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + """ + def cum_sum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... .alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + """ + def cum_prod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... 
) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + """ + def cum_min(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + """ + def cum_max(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + """ + def cum_count(self) -> Self: + """ + Return the cumulative count of the non-null values in the column. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": ["x", "k", None, "d"]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌──────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u32 ┆ u32 │ + ╞══════╪═══════════╪═══════════════════╡ + │ x ┆ 1 ┆ 3 │ + │ k ┆ 2 ┆ 2 │ + │ null ┆ 2 ┆ 1 │ + │ d ┆ 3 ┆ 1 │ + └──────┴───────────┴───────────────────┘ + """ + def floor(self) -> Self: + """ + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + """ + def ceil(self) -> Self: + """ + Rounds up to the nearest integer value. + + Only works on floating point Series. 
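+
+ See Also
+ --------
+ floor
+ round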
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + """ + def round(self, decimals: int = ...) -> Self: + """ + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + """ + def round_sig_figs(self, digits: int) -> Self: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + """ + def dot(self, other: Expr | str) -> Self: + """ + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + """ + def mode(self) -> Self: + """ + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + """ + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + """ + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + """ + def sort(self) -> Self: + """ + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... 
) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + """ + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + """ + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + """ + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + """ + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + """ + def arg_sort(self) -> Self: + """ + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + Expr.gather: Take values by index. + Expr.rank : Get the rank of each row. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + Use gather to apply the arg sort to other columns. + + >>> df.select(pl.col("b").gather(pl.col("a").arg_sort())) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + """ + def arg_max(self) -> Self: + """ + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + """ + def arg_min(self) -> Self: + """ + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + """ + def search_sorted( + self, element: IntoExpr | np.ndarray[Any, Any], side: SearchSortedSide = ... + ) -> Self: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + """ + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + """ + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + multithreaded + Sort using multiple threads. + maintain_order + Whether the order should be maintained if elements are equal. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + """ + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + """ + def get(self, index: int | Expr) -> Self: + """ + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + """ + def shift(self, n: int | IntoExprColumn = ...) -> Self: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + """ + def fill_null( + self, + value: Any | None = ..., + strategy: FillNullStrategy | None = ..., + limit: int | None = ..., + ) -> Self: + """ + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + """ + def fill_nan(self, value: int | float | Expr | None) -> Self: + """ + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + """ + def forward_fill(self, limit: int | None = ...) -> Self: + """ + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + """ + def backward_fill(self, limit: int | None = ...) -> Self: + """ + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... 
) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + """ + def reverse(self) -> Self: + """ + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + """ + def std(self, ddof: int = ...) -> Self: + """ + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + """ + def var(self, ddof: int = ...) -> Self: + """ + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + """ + def max(self) -> Self: + """ + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + """ + def min(self) -> Self: + """ + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + """ + def nan_max(self) -> Self: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
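+
+ In other words, `max` on a column that contains a NaN returns the largest
+ non-NaN value, while `nan_max` returns NaN whenever a NaN is present.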
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + """ + def nan_min(self) -> Self: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + """ + def sum(self) -> Self: + """ + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + """ + def mean(self) -> Self: + """ + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + """ + def median(self) -> Self: + """ + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + """ + def product(self) -> Self: + """ + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + """ + def n_unique(self) -> Self: + """ + Count unique values. + + Notes + ----- + `null` is considered to be a unique value for the purposes of this operation. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 1, 2, 2, 3], "y": [1, 1, 1, None, None]}) + >>> df.select( + ... x_unique=pl.col("x").n_unique(), + ... y_unique=pl.col("y").n_unique(), + ... ) + shape: (1, 2) + ┌──────────┬──────────┐ + │ x_unique ┆ y_unique │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞══════════╪══════════╡ + │ 3 ┆ 2 │ + └──────────┴──────────┘ + """ + def approx_n_unique(self) -> Self: + """ + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"n": [1, 1, 2]}) + >>> df.select(pl.col("n").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ n │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + >>> df = pl.DataFrame({"n": range(1000)}) + >>> df.select( + ... exact=pl.col("n").n_unique(), + ... approx=pl.col("n").approx_n_unique(), + ... ) # doctest: +SKIP + shape: (1, 2) + ┌───────┬────────┐ + │ exact ┆ approx │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═══════╪════════╡ + │ 1000 ┆ 1005 │ + └───────┴────────┘ + """ + def null_count(self) -> Self: + """ + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [10, None, 300], + ... "c": [350, 650, 850], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + """ + def arg_unique(self) -> Self: + """ + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + """ + def unique(self) -> Self: + """ + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + """ + def first(self) -> Self: + """ + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + """ + def last(self) -> Self: + """ + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + """ + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + """ + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Explodes the grouped data into new rows, similar to the results of + `group_by` + `agg` + `explode`. Sorting of the given groups is required + if the groups are not part of the window operation for the operation, + otherwise the result would not make sense. This operation changes the + number of rows. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... 
) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Aggregate values from each group using `mapping_strategy="explode"`. + + >>> df.select( + ... pl.col("a").head(2).over("a", mapping_strategy="explode"), + ... pl.col("b").sort_by("b").head(2).over("a", mapping_strategy="explode"), + ... pl.col("c").sort_by("b").head(2).over("a", mapping_strategy="explode"), + ... ) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 3 │ + │ b ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + """ + def rolling(self, index_column: str) -> Self: + """ + Create rolling groups based on a temporal or integer column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {UInt32, UInt64, Int32, Int64}. Note that the first three get temporarily + cast to Int64, so if performance matters use an Int64 column. + period + Length of the window - must be non-negative. + offset + Offset of the window. Default is `-period`. + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ check_sorted + Whether to check that `index_column` is sorted. + If you are sure the data is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output. + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... ) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + """ + def is_unique(self) -> Self: + """ + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + """ + def is_first_distinct(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + """ + def is_last_distinct(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + """ + def is_duplicated(self) -> Self: + """ + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + """ + def peak_max(self) -> Self: + """ + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + """ + def peak_min(self) -> Self: + """ + Get a boolean mask of the local minimum peaks. 
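+
+ See Also
+ --------
+ peak_max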
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + """ + def quantile( + self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ... + ) -> Self: + """ + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + """ + def cut(self, breaks: Sequence[float]) -> Self: + """ + Bin continuous values into discrete categories. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + """ + def qcut(self, quantiles: Sequence[float] | int) -> Self: + """ + Bin continuous values into discrete categories based on their quantiles. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. 
+ + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + """ + def rle(self) -> Self: + """ + Compress the column data using run-length encoding. + + Run-length encoding (RLE) encodes data by storing each *run* of identical values + as a single value and its length. + + Returns + ------- + Expr + Expression of data type `Struct` with fields `lengths` of data type `Int32` + and `values` of the original data type. + + See Also + -------- + rle_id + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1, None, 1, 3, 3]}) + >>> df.select(pl.col("a").rle()).unnest("a") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + """ + def rle_id(self) -> Self: + """ + Get a distinct integer ID for each run of identical values. + + The ID starts at 0 and increases by one each time the value of the column + changes. + + Returns + ------- + Expr + Expression of data type `UInt32`. + + See Also + -------- + rle + + Notes + ----- + This functionality is especially useful for defining a new group for every time + a column\'s value changes, rather than for every distinct value of that column. 
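+
+ For example, grouping by `pl.col("a").rle_id()` would start a new group every
+ time the value of `a` changes, whereas grouping by `pl.col("a")` puts all equal
+ values in the same group.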
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 1, 1, 1], + ... "b": ["x", "x", None, "y", "y"], + ... } + ... ) + >>> df.with_columns( + ... rle_id_a=pl.col("a").rle_id(), + ... rle_id_ab=pl.struct("a", "b").rle_id(), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────────┬───────────┐ + │ a ┆ b ┆ rle_id_a ┆ rle_id_ab │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪══════════╪═══════════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴──────────┴───────────┘ + """ + def filter( + self, *predicates: IntoExprColumn | Iterable[IntoExprColumn], **constraints: Any + ) -> Self: + """ + Filter the expression based on one or more predicate expressions. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicates + Expression(s) that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + Filter expressions can also take constraints as keyword arguments. + + >>> import polars.selectors as cs + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "a", "a", "a", "b", "b", "b", "b", "b"], + ... "n": [1, 2, 2, 3, 1, 3, 3, 2, 3], + ... }, + ... ) + >>> df.group_by("key").agg( + ... n_1=pl.col("n").filter(n=1).sum(), + ... n_2=pl.col("n").filter(n=2).sum(), + ... n_3=pl.col("n").filter(n=3).sum(), + ... ).sort(by="key") + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ key ┆ n_1 ┆ n_2 ┆ n_3 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 4 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 9 │ + └─────┴─────┴─────┴─────┘ + """ + def where(self, predicate: Expr) -> Self: + """ + Filter a single column. + + .. deprecated:: 0.20.4 + Use :func:`filter` instead. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( # doctest: +SKIP + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + """ + def map_batches( + self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ... + ) -> Self: + """ + Apply a custom python function to a whole Series or sequence of Series. 
+ + The output of this custom function must be a Series (or a NumPy array, in which + case it will be automatically converted into a Series). If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + .. warning:: + If you are looking to map a function over a window function or group_by + context, refer to :func:`map_elements` instead. + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be inferred based on the first non-null value + that is returned by the function. + is_elementwise + If set to true this can run in the streaming engine, but may yield + incorrect results in group-by. Ensure you know what you are doing! + agg_list + Aggregate the values of the expression into a list before applying the + function. This parameter only works in a group-by context. + The function will be invoked only once on a list of groups, rather than + once per group. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + In a group-by context, the `agg_list` parameter can improve performance if used + correctly. The following example has `agg_list` set to `False`, which causes + the function to be applied once per group. The input of the function is a + Series of type `Int64`. This is less efficient. + + >>> df = pl.DataFrame( + ... { + ... "a": [0, 1, 0, 1], + ... "b": [1, 2, 3, 4], + ... } + ... ) + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.max(), agg_list=False) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ list[i64] │ + ╞═════╪═══════════╡ + │ 1 ┆ [4] │ + │ 0 ┆ [3] │ + └─────┴───────────┘ + + Using `agg_list=True` would be more efficient. In this example, the input of + the function is a Series of type `List(Int64)`. + + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.list.max(), agg_list=True) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴─────┘ + """ + def map_elements( + self, + function: Callable[[Series], Series] | Callable[[Any], Any], + return_dtype: PolarsDataType | None = ..., + ) -> Self: + """ + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. 
+ Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be inferred based on the first non-null value + that is returned by the function. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + The threading strategy to use. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a") + ... .map_elements(lambda x: x * 2, return_dtype=pl.Int64) + ... .alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum(), return_dtype=pl.Int64)) + ... .collect() + ... 
) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val") + ... .map_elements(lambda s: s * len(s), return_dtype=pl.List(pl.Int64)) + ... .over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort("key") # doctest: +IGNORE_RESULT + """ + def flatten(self) -> Self: + """ + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + """ + def explode(self) -> Self: + """ + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + """ + def implode(self) -> Self: + """ + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + """ + def gather_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + >>> df.select(pl.col("foo").gather_every(3, offset=1)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 5 │ + │ 8 │ + └─────┘ + """ + def head(self, n: int | Expr = ...) 
-> Self: + """ + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + """ + def tail(self, n: int | Expr = ...) -> Self: + """ + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + """ + def limit(self, n: int | Expr = ...) -> Self: + """ + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + """ + def and_(self, *others: Any) -> Self: + """ + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + """ + def or_(self, *others: Any) -> Self: + """ + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + """ + def eq(self, other: Any) -> Self: + """ + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + """ + def eq_missing(self, other: Any) -> Self: + """ + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
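+
+        See Also
+        --------
+        ne_missing
+        eq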
+ + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + """ + def ge(self, other: Any) -> Self: + """ + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + """ + def gt(self, other: Any) -> Self: + """ + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + """ + def le(self, other: Any) -> Self: + """ + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + """ + def lt(self, other: Any) -> Self: + """ + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + """ + def ne(self, other: Any) -> Self: + """ + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + """ + def ne_missing(self, other: Any) -> Self: + """ + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + """ + def add(self, other: Any) -> Self: + """ + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + """ + def floordiv(self, other: Any) -> Self: + """ + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... 
) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + """ + def mod(self, other: Any) -> Self: + """ + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + """ + def mul(self, other: Any) -> Self: + """ + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + """ + def sub(self, other: Any) -> Self: + """ + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + """ + def neg(self) -> Self: + """ + Method equivalent of unary minus operator `-expr`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 2, None]}) + >>> df.with_columns(pl.col("a").neg()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 0 │ + │ -2 │ + │ null │ + └──────┘ + """ + def truediv(self, other: Any) -> Self: + """ + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + """ + def pow(self, exponent: IntoExprColumn | int | float) -> Self: + """ + Method equivalent of exponentiation operator `expr ** exponent`. 
+ + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬──────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪══════╪════════════╡ + │ 1 ┆ 1 ┆ 1.0 │ + │ 2 ┆ 8 ┆ 2.0 │ + │ 4 ┆ 64 ┆ 16.0 │ + │ 8 ┆ 512 ┆ 512.0 │ + └─────┴──────┴────────────┘ + """ + def xor(self, other: Any) -> Self: + """ + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x") + ... .map_elements(binary_string, return_dtype=pl.String) + ... .alias("bin_x"), + ... pl.col("y") + ... .map_elements(binary_string, return_dtype=pl.String) + ... .alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string, return_dtype=pl.String) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + """ + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + """ + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + """ + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + """ + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. 
+ The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + """ + def is_between( + self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ... + ) -> Self: + """ + Check if this expression is between the given lower and upper bounds. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Notes + ----- + If the value of the `lower_bound` is greater than that of the `upper_bound` + then the result will be False, as no value can satisfy the condition. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + Use column expressions as lower/upper bounds, comparing to a literal value: + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1]}) + >>> df.with_columns( + ... pl.lit(3).is_between(pl.col("a"), pl.col("b")).alias("between_ab") + ... ) + shape: (5, 3) + ┌─────┬─────┬────────────┐ + │ a ┆ b ┆ between_ab │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ bool │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 5 ┆ true │ + │ 2 ┆ 4 ┆ true │ + │ 3 ┆ 3 ┆ true │ + │ 4 ┆ 2 ┆ false │ + │ 5 ┆ 1 ┆ false │ + └─────┴─────┴────────────┘ + """ + def hash( + self, + seed: int = ..., + seed_1: int | None = ..., + seed_2: int | None = ..., + seed_3: int | None = ..., + ) -> Self: + """ + Hash the elements in the selection. + + The hash value is of type `UInt64`. 
+ + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + """ + def reinterpret(self) -> Self: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + """ + def inspect(self, fmt: str = ...) -> Self: + """ + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + """ + def interpolate(self, method: InterpolationMethod = ...) -> Self: + """ + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ 5 ┆ 10.0 │ + │ 6 ┆ 12.0 │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + """ + def rolling_min( + self, + window_size: int | timedelta | str, + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Self: + """ + Apply a rolling min (moving min) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set (in which case, it defaults to `\'right\'`). + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("index").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 18 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └───────┴─────────────────────┴─────────────────┘ + """ + def rolling_max( + self, + window_size: int | timedelta | str, + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Self: + """ + Apply a rolling max (moving max) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. 
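+        For example, with an integer `window_size=3`, the window at a given row
+        covers that row and the two rows immediately before it.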
+ + If you pass a `by` column ``, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set (in which case, it defaults to `\'right\'`). + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... 
{"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("index").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("index").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 4 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 20 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └───────┴─────────────────────┴─────────────────┘ + """ + def rolling_mean( + self, + window_size: int | timedelta | str, + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Self: + """ + Apply a rolling mean (moving mean) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set (in which case, it defaults to `\'right\'`). + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("index").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 2.5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 18.5 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └───────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("index").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └───────┴─────────────────────┴──────────────────┘ + """ + def rolling_sum( + self, + window_size: int | timedelta | str, + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Self: + """ + Apply a rolling sum (moving sum) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set (in which case, it defaults to `\'right\'`). + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("index").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 37 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("index").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 9 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 57 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └───────┴─────────────────────┴─────────────────┘ + """ + def rolling_std( + self, + window_size: int | timedelta | str, + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Self: + """ + Compute a rolling standard deviation. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set (in which case, it defaults to `\'right\'`). + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("index").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ null │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 0.707107 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("index").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 1.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ + """ + def rolling_var( + self, + window_size: int | timedelta | str, + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Self: + """ + Compute a rolling variance. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set (in which case, it defaults to `\'right\'`). + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... 
{"date": pl.datetime_range(start, stop, "1h", eager=True)}
+ ... ).with_row_index()
+ >>> df_temporal
+ shape: (25, 2)
+ ┌───────┬─────────────────────┐
+ │ index ┆ date │
+ │ --- ┆ --- │
+ │ u32 ┆ datetime[μs] │
+ ╞═══════╪═════════════════════╡
+ │ 0 ┆ 2001-01-01 00:00:00 │
+ │ 1 ┆ 2001-01-01 01:00:00 │
+ │ 2 ┆ 2001-01-01 02:00:00 │
+ │ 3 ┆ 2001-01-01 03:00:00 │
+ │ 4 ┆ 2001-01-01 04:00:00 │
+ │ … ┆ … │
+ │ 20 ┆ 2001-01-01 20:00:00 │
+ │ 21 ┆ 2001-01-01 21:00:00 │
+ │ 22 ┆ 2001-01-01 22:00:00 │
+ │ 23 ┆ 2001-01-01 23:00:00 │
+ │ 24 ┆ 2001-01-02 00:00:00 │
+ └───────┴─────────────────────┘
+
+ Compute the rolling var with the temporal windows closed on the left
+
+ >>> df_temporal.with_columns(
+ ... rolling_row_var=pl.col("index").rolling_var(
+ ... window_size="2h", by="date", closed="left"
+ ... )
+ ... )
+ shape: (25, 3)
+ ┌───────┬─────────────────────┬─────────────────┐
+ │ index ┆ date ┆ rolling_row_var │
+ │ --- ┆ --- ┆ --- │
+ │ u32 ┆ datetime[μs] ┆ f64 │
+ ╞═══════╪═════════════════════╪═════════════════╡
+ │ 0 ┆ 2001-01-01 00:00:00 ┆ null │
+ │ 1 ┆ 2001-01-01 01:00:00 ┆ null │
+ │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │
+ │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │
+ │ 4 ┆ 2001-01-01 04:00:00 ┆ 0.5 │
+ │ … ┆ … ┆ … │
+ │ 20 ┆ 2001-01-01 20:00:00 ┆ 0.5 │
+ │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │
+ │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │
+ │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │
+ │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │
+ └───────┴─────────────────────┴─────────────────┘
+
+ Compute the rolling var with the closure of windows on both sides
+
+ >>> df_temporal.with_columns(
+ ... rolling_row_var=pl.col("index").rolling_var(
+ ... window_size="2h", by="date", closed="both"
+ ... )
+ ... )
+ shape: (25, 3)
+ ┌───────┬─────────────────────┬─────────────────┐
+ │ index ┆ date ┆ rolling_row_var │
+ │ --- ┆ --- ┆ --- │
+ │ u32 ┆ datetime[μs] ┆ f64 │
+ ╞═══════╪═════════════════════╪═════════════════╡
+ │ 0 ┆ 2001-01-01 00:00:00 ┆ null │
+ │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │
+ │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │
+ │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │
+ │ 4 ┆ 2001-01-01 04:00:00 ┆ 1.0 │
+ │ … ┆ … ┆ … │
+ │ 20 ┆ 2001-01-01 20:00:00 ┆ 1.0 │
+ │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │
+ │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │
+ │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │
+ │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │
+ └───────┴─────────────────────┴─────────────────┘
+ """
+ def rolling_median(
+ self,
+ window_size: int | timedelta | str,
+ weights: list[float] | None = ...,
+ min_periods: int | None = ...,
+ ) -> Self:
+ """
+ Compute a rolling median.
+
+ .. warning::
+ This functionality is considered **unstable**. It may be changed
+ at any point without it being considered a breaking change.
+
+ If `by` has not been specified (the default), the window at a given row will
+ include the row itself, and the `window_size - 1` elements before it.
+
+ If you pass a `by` column `<t_0, t_1, ..., t_n>`, then `closed="left"` means
+ the windows will be:
+
+ - [t_0 - window_size, t_0)
+ - [t_1 - window_size, t_1)
+ - ...
+ - [t_n - window_size, t_n)
+
+
+ Parameters
+ ----------
+ window_size
+ The length of the window.
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set (in which case, it defaults to `\'right\'`). + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + """ + def rolling_quantile( + self, + quantile: float, + interpolation: RollingInterpolationMethod = ..., + window_size: int | timedelta | str = ..., + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Self: + """ + Compute a rolling quantile. + + .. 
warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set (in which case, it defaults to `\'right\'`). + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + """ + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + """ + def rolling_map( + self, + function: Callable[[Series], Any], + window_size: int, + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Self: + """ + Compute a custom rolling window function. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window. + + Warnings + -------- + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. 
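+
+ For instance, a rolling mean is better written with the dedicated method than
+ with a custom function; a minimal sketch, assuming a numeric column `"a"`:
+
+ >>> # "a" is any numeric column; output omitted
+ >>> pl.col("a").rolling_mean(window_size=3) # doctest: +SKIP
+
+ rather than
+
+ >>> pl.col("a").rolling_map(lambda s: s.mean(), window_size=3) # doctest: +SKIP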
+ + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + """ + def abs(self) -> Self: + """ + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + """ + def rank(self, method: RankMethod = ...) -> Self: + """ + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + """ + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + """ + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + """ + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + """ + def skew(self) -> Self: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + """ + def kurtosis(self) -> Self: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). 
If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + """ + def clip( + self, + lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., + upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., + ) -> Self: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + """ + def lower_bound(self) -> Self: + """ + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + """ + def upper_bound(self) -> Self: + """ + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + """ + def sign(self) -> Self: + """ + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + """ + def sin(self) -> Self: + """ + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + """ + def cos(self) -> Self: + """ + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + """ + def tan(self) -> Self: + """ + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + """ + def cot(self) -> Self: + """ + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + """ + def arcsin(self) -> Self: + """ + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + """ + def arccos(self) -> Self: + """ + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + """ + def arctan(self) -> Self: + """ + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + """ + def sinh(self) -> Self: + """ + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + """ + def cosh(self) -> Self: + """ + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + """ + def tanh(self) -> Self: + """ + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + """ + def arcsinh(self) -> Self: + """ + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + """ + def arccosh(self) -> Self: + """ + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + """ + def arctanh(self) -> Self: + """ + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + """ + def degrees(self) -> Self: + """ + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + """ + def radians(self) -> Self: + """ + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + """ + def reshape(self, dimensions: tuple[int, ...]) -> Self: + """ + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + """ + def shuffle(self, seed: int | None = ...) -> Self: + """ + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + """ + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + """ + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. 
+ fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + """ + def ewm_mean(self) -> Self: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` (the default) the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False`, weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True` (current default), weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1, ignore_nulls=False)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + """ + def ewm_mean_by(self, by: str | IntoExpr) -> Self: + """ + Calculate time-based exponentially weighted moving average. + + Given observations :math:`x_1, x_2, \\ldots, x_n` at times + :math:`t_1, t_2, \\ldots, t_n`, the EWMA is calculated as + + .. math:: + + y_0 &= x_0 + + \\alpha_i &= \\exp(-\\lambda(t_i - t_{i-1})) + + y_i &= \\alpha_i x_i + (1 - \\alpha_i) y_{i-1}; \\quad i > 0 + + where :math:`\\lambda` equals :math:`\\ln(2) / \\text{half_life}`. + + Parameters + ---------- + by + Times to calculate average by. Should be ``DateTime``, ``Date``, ``UInt64``, + ``UInt32``, ``Int64``, or ``Int32`` data type. + half_life + Unit over which observation decays to half its value. 
+ + Can be created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Note that `half_life` is treated as a constant duration - calendar + durations such as months (or even days in the time-zone-aware case) + are not supported, please express your duration in an approximately + equivalent number of hours (e.g. \'370h\' instead of \'1mo\'). + check_sorted + Check whether `by` column is sorted. + Incorrectly setting this to `False` will lead to incorrect output. + + Returns + ------- + Expr + Float32 if input is Float32, otherwise Float64. + + Examples + -------- + >>> from datetime import date, timedelta + >>> df = pl.DataFrame( + ... { + ... "values": [0, 1, 2, None, 4], + ... "times": [ + ... date(2020, 1, 1), + ... date(2020, 1, 3), + ... date(2020, 1, 10), + ... date(2020, 1, 15), + ... date(2020, 1, 17), + ... ], + ... } + ... ).sort("times") + >>> df.with_columns( + ... result=pl.col("values").ewm_mean_by("times", half_life="4d"), + ... ) + shape: (5, 3) + ┌────────┬────────────┬──────────┐ + │ values ┆ times ┆ result │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ date ┆ f64 │ + ╞════════╪════════════╪══════════╡ + │ 0 ┆ 2020-01-01 ┆ 0.0 │ + │ 1 ┆ 2020-01-03 ┆ 0.292893 │ + │ 2 ┆ 2020-01-10 ┆ 1.492474 │ + │ null ┆ 2020-01-15 ┆ null │ + │ 4 ┆ 2020-01-17 ┆ 3.254508 │ + └────────┴────────────┴──────────┘ + """ + def ewm_std(self) -> Self: + """ + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` (the default) the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False`, weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True` (current default), weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1, ignore_nulls=False)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + """ + def ewm_var(self) -> Self: + """ + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` (the default) the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False`, weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True` (current default), weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1, ignore_nulls=False)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + """ + def extend_constant(self, value: IntoExpr, n: int | IntoExprColumn) -> Self: + """ + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value or a unit expressioin with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+
+ Examples
+ --------
+ >>> df = pl.DataFrame({"values": [1, 2, 3]})
+ >>> df.select((pl.col("values") - 1).extend_constant(99, n=2))
+ shape: (5, 1)
+ ┌────────┐
+ │ values │
+ │ --- │
+ │ i64 │
+ ╞════════╡
+ │ 0 │
+ │ 1 │
+ │ 2 │
+ │ 99 │
+ │ 99 │
+ └────────┘
+ """
+ def value_counts(self) -> Self:
+ """
+ Count the occurrences of unique values.
+
+ Parameters
+ ----------
+ sort
+ Sort the output by count in descending order.
+ If set to `False` (default), the order of the output is random.
+ parallel
+ Execute the computation in parallel.
+
+ .. note::
+ This option should likely not be enabled in a group by context,
+ as the computation is already parallelized per group.
+
+ Returns
+ -------
+ Expr
+ Expression of data type :class:`Struct` with mapping of unique values to
+ their count.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {"color": ["red", "blue", "red", "green", "blue", "blue"]}
+ ... )
+ >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT
+ shape: (3, 1)
+ ┌─────────────┐
+ │ color │
+ │ --- │
+ │ struct[2] │
+ ╞═════════════╡
+ │ {"red",2} │
+ │ {"green",1} │
+ │ {"blue",3} │
+ └─────────────┘
+
+ Sort the output by count.
+
+ >>> df.select(pl.col("color").value_counts(sort=True))
+ shape: (3, 1)
+ ┌─────────────┐
+ │ color │
+ │ --- │
+ │ struct[2] │
+ ╞═════════════╡
+ │ {"blue",3} │
+ │ {"red",2} │
+ │ {"green",1} │
+ └─────────────┘
+ """
+ def unique_counts(self) -> Self:
+ """
+ Return a count of the unique values in the order of appearance.
+
+ This method differs from `value_counts` in that it does not return the
+ values, only the counts, and might be faster.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "id": ["a", "b", "b", "c", "c", "c"],
+ ... }
+ ... )
+ >>> df.select(
+ ... [
+ ... pl.col("id").unique_counts(),
+ ... ]
+ ... )
+ shape: (3, 1)
+ ┌─────┐
+ │ id │
+ │ --- │
+ │ u32 │
+ ╞═════╡
+ │ 1 │
+ │ 2 │
+ │ 3 │
+ └─────┘
+ """
+ def log(self, base: float = ...) -> Self:
+ """
+ Compute the logarithm to a given base.
+
+ Parameters
+ ----------
+ base
+ Given base, defaults to `e`
+
+ Examples
+ --------
+ >>> df = pl.DataFrame({"a": [1, 2, 3]})
+ >>> df.select(pl.col("a").log(base=2))
+ shape: (3, 1)
+ ┌──────────┐
+ │ a │
+ │ --- │
+ │ f64 │
+ ╞══════════╡
+ │ 0.0 │
+ │ 1.0 │
+ │ 1.584963 │
+ └──────────┘
+ """
+ def log1p(self) -> Self:
+ """
+ Compute the natural logarithm of each element plus one.
+
+ This computes `log(1 + x)` but is more numerically stable for `x` close to zero.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame({"a": [1, 2, 3]})
+ >>> df.select(pl.col("a").log1p())
+ shape: (3, 1)
+ ┌──────────┐
+ │ a │
+ │ --- │
+ │ f64 │
+ ╞══════════╡
+ │ 0.693147 │
+ │ 1.098612 │
+ │ 1.386294 │
+ └──────────┘
+ """
+ def entropy(self, base: float = ...) -> Self:
+ """
+ Computes the entropy.
+
+ Uses the formula `-sum(pk * log(pk))` where `pk` are discrete probabilities.
+
+ Parameters
+ ----------
+ base
+ Given base, defaults to `e`
+ normalize
+ Normalize pk if it doesn\'t sum to 1.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame({"a": [1, 2, 3]})
+ >>> df.select(pl.col("a").entropy(base=2))
+ shape: (1, 1)
+ ┌──────────┐
+ │ a │
+ │ --- │
+ │ f64 │
+ ╞══════════╡
+ │ 1.459148 │
+ └──────────┘
+ >>> df.select(pl.col("a").entropy(base=2, normalize=False))
+ shape: (1, 1)
+ ┌───────────┐
+ │ a │
+ │ --- │
+ │ f64 │
+ ╞═══════════╡
+ │ -6.754888 │
+ └───────────┘
+ """
+ def cumulative_eval(self, expr: Expr, min_periods: int = ...)
-> Self:
+ """
+ Run an expression over a sliding window that increases by `1` slot every iteration.
+
+ .. warning::
+ This functionality is considered **unstable**. It may be changed
+ at any point without it being considered a breaking change.
+
+ Parameters
+ ----------
+ expr
+ Expression to evaluate
+ min_periods
+ Number of valid values there should be in the window before the expression
+ is evaluated. Valid values = `length - null_count`.
+ parallel
+ Run in parallel. Don\'t do this in a group by or another operation that
+ already has much parallelization.
+
+ Warnings
+ --------
+ This can be really slow as it can have `O(n^2)` complexity. Don\'t use this
+ for operations that visit all elements.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]})
+ >>> df.select(
+ ... [
+ ... pl.col("values").cumulative_eval(
+ ... pl.element().first() - pl.element().last() ** 2
+ ... )
+ ... ]
+ ... )
+ shape: (5, 1)
+ ┌────────┐
+ │ values │
+ │ --- │
+ │ i64 │
+ ╞════════╡
+ │ 0 │
+ │ -3 │
+ │ -8 │
+ │ -15 │
+ │ -24 │
+ └────────┘
+ """
+ def set_sorted(self) -> Self:
+ """
+ Flags the expression as \'sorted\'.
+
+ Enables downstream code to use fast paths for sorted arrays.
+
+ Parameters
+ ----------
+ descending
+ Whether the `Series` order is descending.
+
+ Warnings
+ --------
+ This can lead to incorrect results if this `Series` is not sorted!!
+ Use with care!
+
+ Examples
+ --------
+ >>> df = pl.DataFrame({"values": [1, 2, 3]})
+ >>> df.select(pl.col("values").set_sorted().max())
+ shape: (1, 1)
+ ┌────────┐
+ │ values │
+ │ --- │
+ │ i64 │
+ ╞════════╡
+ │ 3 │
+ └────────┘
+ """
+ def shrink_dtype(self) -> Self:
+ """
+ Shrink numeric columns to the minimal required datatype.
+
+ Shrink to the dtype needed to fit the extrema of this [`Series`].
+ This can be used to reduce memory pressure.
+
+ Examples
+ --------
+ >>> pl.DataFrame(
+ ... {
+ ... "a": [1, 2, 3],
+ ... "b": [1, 2, 2 << 32],
+ ... "c": [-1, 2, 1 << 30],
+ ... "d": [-112, 2, 112],
+ ... "e": [-112, 2, 129],
+ ... "f": ["a", "b", "c"],
+ ... "g": [0.1, 1.32, 0.12],
+ ... "h": [True, None, False],
+ ... }
+ ... ).select(pl.all().shrink_dtype())
+ shape: (3, 8)
+ ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐
+ │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │
+ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │
+ ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡
+ │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │
+ │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │
+ │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │
+ └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘
+ """
+ def hist(self, bins: IntoExpr | None = ...) -> Self:
+ """
+ Bin values into buckets and count their occurrences.
+
+ .. warning::
+ This functionality is considered **unstable**. It may be changed
+ at any point without it being considered a breaking change.
+
+ Parameters
+ ----------
+ bins
+ Discretizations to make.
+ If None is given, we determine the boundaries based on the data.
+ bin_count
+ If no bins are provided, this will be used to determine
+ the distance of the bins
+ include_breakpoint
+ Include a column that indicates the upper breakpoint.
+ include_category
+ Include a column that shows the intervals as categories.
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 3, 8, 8, 2, 1, 3]}) + >>> df.select(pl.col("a").hist(bins=[1, 2, 3])) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 2 │ + │ 2 │ + └─────┘ + >>> df.select( + ... pl.col("a").hist( + ... bins=[1, 2, 3], include_breakpoint=True, include_category=True + ... ) + ... ) + shape: (4, 1) + ┌───────────────────────┐ + │ a │ + │ --- │ + │ struct[3] │ + ╞═══════════════════════╡ + │ {1.0,"(-inf, 1.0]",2} │ + │ {2.0,"(1.0, 2.0]",1} │ + │ {3.0,"(2.0, 3.0]",2} │ + │ {inf,"(3.0, inf]",2} │ + └───────────────────────┘ + """ + def replace( + self, + old: IntoExpr | Sequence[Any] | Mapping[Any, Any], + new: IntoExpr | Sequence[Any] | NoDefault = ..., + ) -> Self: + """ + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(old=Series(mapping.keys()), new=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting expression. If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(replaced=pl.col("a").replace(2, 100)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> df.with_columns(replaced=pl.col("a").replace([2, 3], [100, 200])) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=-1)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ -1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. 
+ + >>> df = pl.DataFrame({"a": ["x", "y", "z"]}) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> df.with_columns(replaced=pl.col("a").replace(mapping)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=None)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> df.with_columns( + ... replaced=pl.col("a").replace(mapping, return_dtype=pl.UInt8) + ... ) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ u8 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Expression input is supported for all parameters. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]}) + >>> df.with_columns( + ... replaced=pl.col("a").replace( + ... old=pl.col("a").max(), + ... new=pl.col("b").sum(), + ... default=pl.col("b"), + ... ) + ... ) + shape: (4, 3) + ┌─────┬─────┬──────────┐ + │ a ┆ b ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═════╪══════════╡ + │ 1 ┆ 1.5 ┆ 1.5 │ + │ 2 ┆ 2.5 ┆ 2.5 │ + │ 2 ┆ 5.0 ┆ 5.0 │ + │ 3 ┆ 1.0 ┆ 10.0 │ + └─────┴─────┴──────────┘ + """ + def map( + self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ... + ) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + """ + def apply( + self, + function: Callable[[Series], Series] | Callable[[Any], Any], + return_dtype: PolarsDataType | None = ..., + ) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + """ + def rolling_apply( + self, + function: Callable[[Series], Any], + window_size: int, + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + """ + def register_plugin(self) -> Expr: + """ + Register a plugin function. + + .. deprecated:: 0.20.16 + Use :func:`polars.plugins.register_plugin_function` instead. + + See the `user guide `_ + for more information about plugins. + + Warnings + -------- + This method is deprecated. Use the new `polars.plugins.register_plugin_function` + function instead. + + This is highly unsafe as this will call the C function loaded by + `lib::symbol`. + + The parameters you set dictate how Polars will handle the function. + Make sure they are correct! + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + """ + def _register_plugin(self) -> Expr: ... + def take_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. 
deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + .. deprecated:: 0.20.11 + This method has been renamed to :meth:`deserialize`. + Note that the new method operates on file-like inputs rather than strings. + Enclose your input in `io.StringIO` to keep the same behavior. + + Parameters + ---------- + value + JSON encoded string value + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... + +def _prepare_alpha( + com: float | int | None = ..., + span: float | int | None = ..., + half_life: float | int | None = ..., + alpha: float | int | None = ..., +) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" + +def _prepare_rolling_window_args( + window_size: int | timedelta | str, min_periods: int | None = ... 
+) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..5814ecf --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/lazyframe/frame.pyi @@ -0,0 +1,4544 @@ +#: version 0.20.23 +import P +import np +import pa +from polars.polars import PyLazyFrame +from pathlib import Path +from polars._utils.async_ import ( + _AioDataFrameResult as _AioDataFrameResult, + _GeventDataFrameResult as _GeventDataFrameResult, +) +from polars._utils.convert import ( + negate_duration_string as negate_duration_string, + parse_as_duration_string as parse_as_duration_string, +) +from polars._utils.deprecation import ( + deprecate_function as deprecate_function, + deprecate_parameter_as_positional as deprecate_parameter_as_positional, + deprecate_renamed_function as deprecate_renamed_function, + deprecate_renamed_parameter as deprecate_renamed_parameter, + deprecate_saturating as deprecate_saturating, + issue_deprecation_warning as issue_deprecation_warning, +) +from polars._utils.parse_expr_input import ( + parse_as_expression as parse_as_expression, + parse_as_list_of_expressions as parse_as_list_of_expressions, +) +from polars._utils.unstable import ( + issue_unstable_warning as issue_unstable_warning, + unstable as unstable, +) +from polars._utils.various import ( + _in_notebook as _in_notebook, + is_bool_sequence as is_bool_sequence, + is_sequence as is_sequence, + normalize_filepath as normalize_filepath, + parse_percentiles as parse_percentiles, +) +from polars._utils.wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.convert import from_dict as from_dict +from polars.datatypes.classes import ( + Boolean as Boolean, + Categorical as Categorical, + DataTypeGroup as DataTypeGroup, + Date as Date, + Datetime as Datetime, + Duration as Duration, + Enum as Enum, + Float32 as Float32, + Float64 as Float64, + Int16 as Int16, + Int32 as Int32, + Int64 as Int64, + Int8 as Int8, + Null as Null, + Object as Object, + String as String, + Time as Time, + UInt16 as UInt16, + UInt32 as UInt32, + UInt64 as UInt64, + UInt8 as UInt8, + Unknown as Unknown, +) +from polars.datatypes.convert import ( + is_polars_dtype as is_polars_dtype, + py_type_to_dtype as py_type_to_dtype, +) +from polars.dependencies import import_optional as import_optional +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.lazyframe.in_process import InProcessQuery as InProcessQuery +from polars.selectors import ( + _expand_selectors as _expand_selectors, + by_dtype as by_dtype, + expand_selector as expand_selector, +) +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from typing import ( + Any, + Callable, + ClassVar as _ClassVar, + Collection, + Iterable, + Mapping, + NoReturn, + Sequence, +) + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__( + self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ... + ) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_python_function( + cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any + ) -> Self: ... 
+ @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + Warnings + -------- + This function uses :mod:`pickle` under some circumstances, and as + such inherits the security implications. Deserializing can execute + arbitrary code so it should only be attempted on trusted data. + pickle is only used when the logical plan contains python UDFs. + + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"MapFunction":{"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"function":{"Stats":"Sum"}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + """ + def pipe( + self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs + ) -> T: + """ + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... 
{ + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + """ + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + """ + Creates a summary of statistics for a LazyFrame, returning a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method used when calculating percentiles. + + Returns + ------- + DataFrame + + Notes + ----- + The median is included by default as the 50% percentile. + + Warnings + -------- + * This method does *not* maintain the laziness of the frame, and will `collect` + the final result. This could potentially be an expensive operation. + * We do not guarantee the output of `describe` to be stable. It will show + statistics that we deem informative, and may be updated in the future. + Using `describe` programmatically (versus interactive exploration) is + not recommended for this reason. + + Examples + -------- + >>> from datetime import date, time + >>> lf = pl.LazyFrame( + ... { + ... "float": [1.0, 2.8, 3.0], + ... "int": [40, 50, None], + ... "bool": [True, False, True], + ... "str": ["zz", "xx", "yy"], + ... "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)], + ... "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)], + ... } + ... ) + + Show default frame statistics: + + >>> lf.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + + Customize which percentiles are displayed, applying linear interpolation: + + >>> with pl.Config(tbl_rows=12): + ... lf.describe( + ... percentiles=[0.1, 0.3, 0.5, 0.7, 0.9], + ... interpolation="linear", + ... 
) + shape: (11, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │ + │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │ + │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │ + │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + """ + def explain(self) -> str: + """ + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + tree_format + Format the output as a tree + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + """ + def show_graph(self) -> str | None: + """ + Show a plot of the query plan. + + Note that graphviz must be installed to render the visualization (if not + already present you can download it here: `_). + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... 
).show_graph() # doctest: +SKIP + """ + def inspect(self, fmt: str = ...) -> Self: + """ + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and pass on + the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + """ + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + """ + Sort the LazyFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + multithreaded + Sort using multiple threads. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + """ + def sql(self, query: str) -> Self: + """ + Execute a SQL query against the LazyFrame. + + .. warning:: + This functionality is considered **unstable**, although it is close to + being considered stable. It may be changed at any point without it being + considered a breaking change. + + Parameters + ---------- + query + SQL query to execute. + table_name + Optionally provide an explicit name for the table that represents the + calling frame (the alias "self" will always be registered/available). + + Notes + ----- + * The calling frame is automatically registered as a table in the SQL context + under the name "self". All DataFrames and LazyFrames found in the current + set of global variables are also registered, using their variable name. + * More control over registration and execution behaviour is available by + using the :class:`SQLContext` object. 
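+
+          For illustration, frames can also be registered and queried through an
+          explicit context (a minimal sketch, not part of the original docstring;
+          it assumes the keyword-argument form of the :class:`SQLContext`
+          constructor and lazy execution by default):
+
+          >>> # register a LazyFrame under the table name "frame", then query it
+          >>> ctx = pl.SQLContext(frame=pl.LazyFrame({"a": [1, 2, 3]}))
+          >>> ctx.execute("SELECT a FROM frame WHERE a > 1").collect()  # doctest: +SKIP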
+ + See Also + -------- + SQLContext + + Examples + -------- + >>> lf1 = pl.LazyFrame({"a": [1, 2, 3], "b": [6, 7, 8], "c": ["z", "y", "x"]}) + >>> lf2 = pl.LazyFrame({"a": [3, 2, 1], "d": [125, -654, 888]}) + + Query the LazyFrame using SQL: + + >>> lf1.sql("SELECT c, b FROM self WHERE a > 1").collect() + shape: (2, 2) + ┌─────┬─────┐ + │ c ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ y ┆ 7 │ + │ x ┆ 8 │ + └─────┴─────┘ + + Join two LazyFrames: + + >>> lf1.sql( + ... \'\'\' + ... SELECT self.*, d + ... FROM self + ... INNER JOIN lf2 USING (a) + ... WHERE a > 1 AND b < 8 + ... \'\'\' + ... ).collect() + shape: (1, 4) + ┌─────┬─────┬─────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪══════╡ + │ 2 ┆ 7 ┆ y ┆ -654 │ + └─────┴─────┴─────┴──────┘ + + Apply SQL transforms (aliasing "self" to "frame") and subsequently + filter natively (you can freely mix SQL and native operations): + + >>> lf1.sql( + ... query=\'\'\' + ... SELECT + ... a, + ... (a % 2 == 0) AS a_is_even, + ... (b::float4 / 2) AS "b/2", + ... CONCAT_WS(\':\', c, c, c) AS c_c_c + ... FROM frame + ... ORDER BY a + ... \'\'\', + ... table_name="frame", + ... ).filter(~pl.col("c_c_c").str.starts_with("x")).collect() + shape: (2, 4) + ┌─────┬───────────┬─────┬───────┐ + │ a ┆ a_is_even ┆ b/2 ┆ c_c_c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ bool ┆ f32 ┆ str │ + ╞═════╪═══════════╪═════╪═══════╡ + │ 1 ┆ false ┆ 3.0 ┆ z:z:z │ + │ 2 ┆ true ┆ 3.5 ┆ y:y:y │ + └─────┴───────────┴─────┴───────┘ + """ + def top_k(self, k: int) -> Self: + """ + Return the `k` largest elements. + + If `descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + multithreaded + Sort using multiple threads. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + """ + def bottom_k(self, k: int) -> Self: + """ + Return the `k` smallest elements. + + If `descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` largest. Bottom-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. 
+ Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + multithreaded + Sort using multiple threads. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + """ + def profile(self) -> tuple[DataFrame, DataFrame]: + """ + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + """ + def collect(self) -> DataFrame | InProcessQuery: + """ + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + background + Run the query in the background and get a handle to the query. + This handle can be used to fetch the result or cancel the query. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + """ + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + """ + Collect DataFrame asynchronously in thread pool. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Collects into a DataFrame (like :func:`collect`) but, instead of returning + a DataFrame directly, it is scheduled to be collected inside a thread pool, + while this method returns almost instantly. + + This can be useful if you use `gevent` or `asyncio` and want to release + control to other greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + .. note:: + Use :func:`explain` to see if Polars can process the query in + streaming mode. 
+ + Returns + ------- + If `gevent=False` (default) then returns an awaitable. + + If `gevent=True` then returns wrapper that has a + `.get(block=True, timeout=None)` method. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async : Collect multiple LazyFrames at the same time lazily. + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + """ + def sink_parquet(self, path: str | Path) -> None: + """ + Evaluate the query in streaming mode and write to a Parquet file. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This is the default behavior. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + """ + def sink_ipc(self, path: str | Path) -> DataFrame: + """ + Evaluate the query in streaming mode and write to an IPC file. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. 
+ compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + """ + def sink_csv(self, path: str | Path) -> DataFrame: + """ + Evaluate the query in streaming mode and write to a CSV file. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + """ + def sink_ndjson(self, path: str | Path) -> DataFrame: + """ + Evaluate the query in streaming mode and write to an NDJSON file. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ndjson("out.ndjson") # doctest: +SKIP + """ + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + """ + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + """ + def lazy(self) -> Self: + """ + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... 
"c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + """ + def cache(self) -> Self: + """ + Cache the result once the execution of the physical plan hits this node. + + It is not recommended using this as the optimizer likely can do a better job. + """ + def cast( + self, + dtypes: Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType] | PolarsDataType, + ) -> Self: + """ + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns matching one dtype (or dtype group) to another dtype: + + >>> lf.cast({pl.Date: pl.Datetime}).collect() + shape: (3, 3) + ┌─────┬─────┬─────────────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ datetime[μs] │ + ╞═════╪═════╪═════════════════════╡ + │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │ + │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │ + │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │ + └─────┴─────┴─────────────────────┘ + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.String).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + """ + def clear(self, n: int = ...) -> LazyFrame: + """ + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().collect() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).collect() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + """ + def clone(self) -> Self: + """ + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. 
+ + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + """ + def filter( + self, + *predicates: IntoExprColumn + | Iterable[IntoExprColumn] + | bool + | list[bool] + | np.ndarray[Any, Any], + **constraints: Any, + ) -> Self: + """ + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + """ + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. 
+ + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + """ + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + """ + def group_by(self, *by: IntoExpr | Iterable[IntoExpr], **named_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + Parameters + ---------- + *by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + **named_by + Additional columns to group by, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + """ + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a temporal or integer column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `group_by` is + specified, then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {UInt32, UInt64, Int32, Int64}. Note that the first three get temporarily + cast to Int64, so if performance matters use an Int64 column. + period + Length of the window - must be non-negative. + offset + Offset of the window. Default is `-period`. + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + group_by + Also group by this column/these columns + check_sorted + Check whether `index_column` is sorted (or, if `group_by` is given, + check whether it\'s sorted within each group). + When the `group_by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `group_by` + columns are passed, it will only be sorted within each group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + """ + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, `every`, and the earliest + datapoint. See the `start_by` argument description for details. + + .. warning:: + The index column must be sorted in ascending order. If `group_by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `group_by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, does not take effect if `start_by` is \'datapoint\'. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + group_by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + + The resulting window is then shifted back until the earliest datapoint + is in or in front of it. + check_sorted + Check whether `index_column` is sorted (or, if `group_by` is given, + check whether it\'s sorted within each group). + When the `group_by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `group_by` columns are + passed, it will only be sorted within each group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... 
) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... 
every="1h", + ... closed="both", + ... group_by="groups", + ... include_boundaries=True, + ... ).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + """ + def join_asof(self, other: LazyFrame) -> Self: + """ + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + """ + def join( + self, + other: LazyFrame, + on: str | Expr | Sequence[str | Expr] | None = ..., + how: JoinStrategy = ..., + ) -> Self: + """ + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the Cartesian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + join_nulls + Join on null values. By default null values will never produce matches. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this LazyFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬─────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ i64 │ + ╞═════╪══════╪═══════╪═════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1 │ + │ 2 ┆ 4.0 ┆ true ┆ 4 │ + │ 3 ┆ 10.0 ┆ false ┆ 9 │ + │ 4 ┆ 13.0 ┆ true ┆ 16 │ + └─────┴──────┴───────┴─────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴─────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴─────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + """ + def with_columns_seq( + self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr + ) -> Self: + """ + Add columns to this LazyFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. 
+ Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + """ + def with_context(self, other: Self | list[Self]) -> Self: + """ + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + """ + def drop(self, *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector]) -> Self: + """ + Remove columns from the DataFrame. + + Parameters + ---------- + *columns + Names of the columns that should be removed from the dataframe. + Accepts column selector input. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + """ + def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> Self: + """ + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name, or a function + that takes the old name as input and returns the new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + >>> lf.rename(lambda column_name: "c" + column_name[1:]).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ coo ┆ car ┆ cam │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + """ + def reverse(self) -> Self: + """ + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + """ + def shift(self, n: int | IntoExprColumn = ...) -> Self: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + """ + def slice(self, offset: int, length: int | None = ...) -> Self: + """ + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + """ + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... 
"b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + """ + def head(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + """ + def tail(self, n: int = ...) -> Self: + """ + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + """ + def last(self) -> Self: + """ + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + """ + def first(self) -> Self: + """ + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + """ + def approx_n_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.20.11 + Use `select(pl.all().approx_n_unique())` instead. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() # doctest: +SKIP + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + """ + def with_row_index(self, name: str = ..., offset: int = ...) -> Self: + """ + Add a row index as the first column in the LazyFrame. + + Parameters + ---------- + name + Name of the index column. + offset + Start the index at this offset. Cannot be negative. + + Warnings + -------- + Using this function can have a negative effect on query performance. 
+ This may, for instance, block predicate pushdown optimization. + + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_index().collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + >>> lf.with_row_index("id", offset=1000).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ + + An index column can also be created using the expressions :func:`int_range` + and :func:`len`. + + >>> lf.select( + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), + ... pl.all(), + ... ).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + """ + Add a column at index 0 that counts the rows. + + .. deprecated:: 0.20.4 + Use :meth:`with_row_index` instead. + Note that the default column name has changed from \'row_nr\' to \'index\'. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() # doctest: +SKIP + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + """ + def gather_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + >>> lf.gather_every(2, offset=1).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + """ + def fill_null( + self, + value: Any | None = ..., + strategy: FillNullStrategy | None = ..., + limit: int | None = ..., + ) -> Self: + """ + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... 
"b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + """ + def fill_nan(self, value: int | float | Expr | None) -> Self: + """ + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + """ + def std(self, ddof: int = ...) -> Self: + """ + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + """ + def var(self, ddof: int = ...) -> Self: + """ + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + """ + def max(self) -> Self: + """ + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + """ + def min(self) -> Self: + """ + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + """ + def sum(self) -> Self: + """ + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + """ + def mean(self) -> Self: + """ + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + """ + def median(self) -> Self: + """ + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + """ + def null_count(self) -> Self: + """ + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + """ + def quantile( + self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ... + ) -> Self: + """ + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + """ + def explode( + self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr + ) -> Self: + """ + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of the `List` or `Array` data type. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + """ + def unique( + self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ... + ) -> Self: + """ + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + """ + def drop_nulls( + self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ... + ) -> Self: + """ + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. 
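+
+ The `subset` parameter also accepts a plain column name. As a minimal
+ illustrative sketch, reusing the frame above and only considering nulls
+ in the `ham` column:
+
+ >>> lf.drop_nulls(subset="ham").collect()
+ shape: (2, 3)
+ ┌─────┬──────┬─────┐
+ │ foo ┆ bar ┆ ham │
+ │ --- ┆ --- ┆ --- │
+ │ i64 ┆ i64 ┆ str │
+ ╞═════╪══════╪═════╡
+ │ 1 ┆ 6 ┆ a │
+ │ 2 ┆ null ┆ b │
+ └─────┴──────┴─────┘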
+ + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + """ + def melt( + self, + id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., + value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., + variable_name: str | None = ..., + value_name: str | None = ..., + ) -> Self: + """ + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + """ + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. 
+ streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + """ + def interpolate(self) -> Self: + """ + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + """ + def unnest( + self, + columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], + *more_columns: ColumnNameOrSelector, + ) -> Self: + """ + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + """ + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. 
+ + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update( + self, + other: LazyFrame, + on: str | Sequence[str] | None = ..., + how: Literal["left", "inner", "outer"] = ..., + ) -> Self: + """ + Update the values in this `LazyFrame` with the non-null values in `other`. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. If set to `None` (default), + the implicit row index of each frame is used as a join key. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + """ + def count(self) -> Self: + """ + Return the number of non-null elements for each column. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]} + ... ) + >>> lf.count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 3 ┆ 0 │ + └─────┴─────┴─────┘ + """ + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + check_sorted + Check whether `index_column` is sorted (or, if `by` is given, + check whether it's sorted within each group). + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + Check whether `index_column` is sorted (or, if `by` is given, + check whether it's sorted within each group). + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, does not take effect if `start_by` is \'datapoint\'. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + + The resulting window is then shifted back until the earliest datapoint + is in or in front of it. + check_sorted + Check whether `index_column` is sorted (or, if `by` is given, + check whether it\'s sorted within each group). + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + """ + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function; if set to `None`, we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to run with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is executed on the full + dataset. + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + """ + def take_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + ..
deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/series/series.pyi new file mode 100644 index 0000000..cae43a4 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.23/polars/series/series.pyi @@ -0,0 +1,5593 @@ +#: version 0.20.23 +import np as np +import pa as pa +import pd as pd +from polars.polars import PySeries +from datetime import date, datetime, timedelta +from polars._utils.construction.other import numpy_to_idxs as numpy_to_idxs +from polars._utils.construction.series import ( + arrow_to_pyseries as arrow_to_pyseries, + dataframe_to_pyseries as dataframe_to_pyseries, + iterable_to_pyseries as iterable_to_pyseries, + numpy_to_pyseries as numpy_to_pyseries, + pandas_to_pyseries as pandas_to_pyseries, + sequence_to_pyseries as sequence_to_pyseries, + series_to_pyseries as series_to_pyseries, +) +from polars._utils.convert import ( + date_to_int as date_to_int, + datetime_to_int as datetime_to_int, + time_to_int as time_to_int, + timedelta_to_int as timedelta_to_int, +) +from polars._utils.deprecation import ( + deprecate_function as deprecate_function, + deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, + deprecate_renamed_function as deprecate_renamed_function, + deprecate_renamed_parameter as deprecate_renamed_parameter, + issue_deprecation_warning as issue_deprecation_warning, +) +from polars._utils.unstable import unstable as unstable +from polars._utils.various import ( + _is_generator as _is_generator, + no_default as no_default, + parse_version as parse_version, + range_to_slice as range_to_slice, + scale_bytes as scale_bytes, + sphinx_accessor as sphinx_accessor, + warn_null_comparison as warn_null_comparison, +) +from polars._utils.wrap import wrap_df as wrap_df +from polars.datatypes.classes import ( + Array as Array, + Boolean as Boolean, + Categorical as Categorical, + Date as Date, + Datetime as Datetime, + Decimal as Decimal, + Duration as Duration, + Enum as Enum, + Float64 as Float64, + Int16 as Int16, + Int32 as Int32, + Int64 as Int64, + Int8 as Int8, + List as List, + Null as Null, + Object as Object, + String as String, + Time as Time, + UInt32 as UInt32, + UInt64 as UInt64, + UInt8 as UInt8, + Unknown as Unknown, +) +from polars.datatypes.convert import ( + dtype_to_ctype as dtype_to_ctype, + is_polars_dtype as is_polars_dtype, + maybe_cast as maybe_cast, + numpy_char_code_to_dtype as numpy_char_code_to_dtype, + py_type_to_dtype as py_type_to_dtype, + supported_numpy_char_code as supported_numpy_char_code, +) +from polars.dependencies import ( + _check_for_numpy as _check_for_numpy, + _check_for_pandas as _check_for_pandas, + _check_for_pyarrow as _check_for_pyarrow, + hvplot as hvplot, +) +from polars.exceptions import ( + ModuleUpgradeRequired as ModuleUpgradeRequired, + ShapeError as ShapeError, +) +from polars.meta.index_type import get_index_type as get_index_type +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import 
DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from typing import ( + Any, + ArrayLike, + Callable, + ClassVar as _ClassVar, + Collection, + Generator, + Iterable, + Mapping, + NoReturn, + Sequence, +) + +TYPE_CHECKING: bool +BUILDING_SPHINX_DOCS: None +_HVPLOT_AVAILABLE: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set[str]] = ... + def __init__( + self, + name: str | ArrayLike | None = ..., + values: ArrayLike | None = ..., + dtype: PolarsDataType | None = ..., + ) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _import_from_c(cls, name: str, pointers: list[tuple[int, int]]) -> Self: + """ + Construct a Series from Arrows C interface. + + Warning + ------- + This will read the `array` pointer without moving it. The host process should + garbage collect the heap pointer, but not its contents. + """ + def _get_buffer_info(self) -> BufferInfo: + """ + Return pointer, offset, and length information about the underlying buffer. + + Returns + ------- + tuple of ints + Tuple of the form (pointer, offset, length) + + Raises + ------ + TypeError + If the `Series` data type is not physical. + ComputeError + If the `Series` contains multiple chunks. + + Notes + ----- + This method is mainly intended for use with the dataframe interchange protocol. + """ + def _get_buffers(self) -> SeriesBuffers: + """ + Return the underlying values, validity, and offsets buffers as Series. + + The values buffer always exists. + The validity buffer may not exist if the column contains no null values. + The offsets buffer only exists for Series of data type `String` and `List`. + + Returns + ------- + dict + Dictionary with `"values"`, `"validity"`, and `"offsets"` keys mapping + to the corresponding buffer or `None` if the buffer doesn\'t exist. + + Warnings + -------- + The underlying buffers for `String` Series cannot be represented in this + format. Instead, the buffers are converted to a values and offsets buffer. + + Notes + ----- + This method is mainly intended for use with the dataframe interchange protocol. + """ + def _from_buffer(self, dtype: PolarsDataType, buffer_info: BufferInfo, owner: Any) -> Self: + """ + Construct a Series from information about its underlying buffer. + + Parameters + ---------- + dtype + The data type of the buffer. + Must be a physical type (integer, float, or boolean). + buffer_info + Tuple containing buffer information in the form `(pointer, offset, length)`. + owner + The object owning the buffer. + + Returns + ------- + Series + + Raises + ------ + TypeError + When the given `dtype` is not supported. + + Notes + ----- + This method is mainly intended for use with the dataframe interchange protocol. + """ + def _from_buffers( + self, dtype: PolarsDataType, data: Series | Sequence[Series], validity: Series | None = ... + ) -> Self: + """ + Construct a Series from information about its underlying buffers. + + Parameters + ---------- + dtype + The data type of the resulting Series. + data + Buffers describing the data. For most data types, this is a single Series of + the physical data type of `dtype`. 
Some data types require multiple buffers: + + - `String`: A data buffer of type `UInt8` and an offsets buffer + of type `Int64`. Note that this does not match how the data + is represented internally and data copy is required to construct + the Series. + validity + Validity buffer. If specified, must be a Series of data type `Boolean`. + + Returns + ------- + Series + + Raises + ------ + TypeError + When the given `dtype` is not supported or the other inputs do not match + the requirements for constructing a Series of the given `dtype`. + + Warnings + -------- + Constructing a `String` Series requires specifying a values and offsets buffer, + which does not match the actual underlying buffers. The values and offsets + buffer are converted into the actual buffers, which copies data. + + Notes + ----- + This method is mainly intended for use with the dataframe interchange protocol. + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Series | Expr: + """ + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + """ + def ne(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Series | Expr: + """ + Method equivalent of inequality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with.
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + """ + def ge(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__( + self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] + ) -> Any: ... + def __setitem__( + self, + key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], + value: Any, + ) -> None: ... + def __array__(self, dtype: Any | None = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + """ + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + """ + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + """ + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. 
Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + """ + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.sqrt() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.414214 + 1.732051 + ] + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.cbrt() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.259921 + 1.44225 + ] + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `False` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + """ + def log(self, base: float = ...) -> Series: + """ + Compute the logarithm to a given base. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.log() + shape: (3,) + Series: '' [f64] + [ + 0.0 + 0.693147 + 1.098612 + ] + """ + def log1p(self) -> Series: + """ + Compute the natural logarithm of the input array plus one, element-wise. 
+ + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.log1p() + shape: (3,) + Series: '' [f64] + [ + 0.693147 + 1.098612 + 1.386294 + ] + """ + def log10(self) -> Series: + """ + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> s = pl.Series([10, 100, 1000]) + >>> s.log10() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + """ + def exp(self) -> Series: + """ + Compute the exponential, element-wise. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.exp() + shape: (3,) + Series: '' [f64] + [ + 2.718282 + 7.389056 + 20.085537 + ] + """ + def drop_nulls(self) -> Series: + """ + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + """ + def drop_nans(self) -> Series: + """ + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + """ + def to_frame(self, name: str | None = ...) -> DataFrame: + """ + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + """ + def describe( + self, + percentiles: Sequence[float] | float | None = ..., + interpolation: RollingInterpolationMethod = ..., + ) -> DataFrame: + """ + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method used when calculating percentiles. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> s = pl.Series([1, 2, 3, 4, 5]) + >>> s.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + Non-numeric data types may not have all statistics available. 
+ + >>> s = pl.Series(["aa", "aa", None, "bb", "cc"]) + >>> s.describe() + shape: (4, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════════╪═══════╡ + │ count ┆ 4 │ + │ null_count ┆ 1 │ + │ min ┆ aa │ + │ max ┆ cc │ + └────────────┴───────┘ + """ + def sum(self) -> int | float: + """ + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + """ + def mean(self) -> PythonLiteral | None: + """ + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + """ + def product(self) -> int | float: + """ + Reduce this Series to the product value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.product() + 6 + """ + def pow(self, exponent: int | float | Series) -> Series: + """ + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 8 + 27 + 64 + ] + """ + def min(self) -> PythonLiteral | None: + """ + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + """ + def max(self) -> PythonLiteral | None: + """ + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + """ + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4]) + >>> s.nan_max() + 4 + + >>> s = pl.Series("a", [1, float("nan"), 4]) + >>> s.nan_max() + nan + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmin` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4]) + >>> s.nan_min() + 1 + + >>> s = pl.Series("a", [1, float("nan"), 4]) + >>> s.nan_min() + nan + """ + def std(self, ddof: int = ...) -> float | timedelta | None: + """ + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + """ + def var(self, ddof: int = ...) -> float | timedelta | None: + """ + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + """ + def median(self) -> PythonLiteral | None: + """ + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + """ + def quantile( + self, quantile: float, interpolation: RollingInterpolationMethod = ... + ) -> float | None: + """ + Get the quantile value of this Series.
+ + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + """ + def to_dummies(self) -> DataFrame: + """ + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variable being encoded. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + >>> s.to_dummies(drop_first=True) + shape: (3, 2) + ┌─────┬─────┐ + │ a_2 ┆ a_3 │ + │ --- ┆ --- │ + │ u8 ┆ u8 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 0 │ + │ 0 ┆ 1 │ + └─────┴─────┘ + """ + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + """ + Bin continuous values into discrete categories. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. 
+ + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + """ + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + """ + Bin continuous values into discrete categories based on their quantiles. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. 
+ + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + """ + def rle(self) -> Series: + """ + Compress the Series data using run-length encoding. + + Run-length encoding (RLE) encodes data by storing each *run* of identical values + as a single value and its length. + + Returns + ------- + Series + Series of data type `Struct` with fields `lengths` of data type `Int32` + and `values` of the original data type. + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + """ + def rle_id(self) -> Series: + """ + Get a distinct integer ID for each run of identical values. + + The ID starts at 0 and increases by one each time the value of the column + changes. + + Returns + ------- + Series + Series of data type `UInt32`. + + See Also + -------- + rle + + Notes + ----- + This functionality is especially useful for defining a new group for every time + a column\'s value changes, rather than for every distinct value of that column. + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + """ + def hist(self, bins: list[float] | None = ...) -> DataFrame: + """ + Bin values into buckets and count their occurrences. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬───────┐ + │ break_point ┆ category ┆ count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═══════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴───────┘ + """ + def value_counts(self) -> DataFrame: + """ + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. 
+ + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴───────┘ + + Sort the output by count. + + >>> s.value_counts(sort=True) + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴───────┘ + """ + def unique_counts(self) -> Series: + """ + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + """ + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + """ + Run an expression over a sliding window that increases `1` slot every iteration. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [i64] + [ + 0 + -3 + -8 + -15 + -24 + ] + """ + def alias(self, name: str) -> Series: + """ + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + """ + def rename(self, name: str) -> Series: + """ + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + """ + def chunk_lengths(self) -> list[int]: + """ + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + """ + def n_chunks(self) -> int: + """ + Get the number of chunks that this Series contains. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + """ + def cum_max(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + """ + def cum_min(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + """ + def cum_prod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + """ + def cum_sum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + """ + def cum_count(self) -> Self: + """ + Return the cumulative count of the non-null values in the column. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> s = pl.Series(["x", "k", None, "d"]) + >>> s.cum_count() + shape: (4,) + Series: \'\' [u32] + [ + 1 + 2 + 2 + 3 + ] + """ + def slice(self, offset: int, length: int | None = ...) -> Series: + """ + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + """ + def append(self, other: Series) -> Self: + """ + Append a Series to this one. + + The resulting series will consist of multiple chunks. + + Parameters + ---------- + other + Series to append. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + """ + def extend(self, other: Series) -> Self: + """ + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). 
+ + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + """ + def filter(self, predicate: Series | list[bool]) -> Self: + """ + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + """ + def head(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + """ + def tail(self, n: int = ...) -> Series: + """ + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + """ + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + """ + def gather_every(self, n: int, offset: int = ...) -> Series: + """ + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Start the row index at this offset. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + >>> s.gather_every(2, offset=1) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + """ + def sort(self) -> Self: + """ + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + multithreaded + Sort using multiple threads. 
+ in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + """ + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + """ + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + """ + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + """ + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + """ + def arg_sort(self) -> Series: + """ + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + See Also + -------- + Series.gather: Take values by index. + Series.rank : Get the rank of each row. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + """ + def arg_unique(self) -> Series: + """ + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + """ + def arg_min(self) -> int | None: + """ + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + """ + def arg_max(self) -> int | None: + """ + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + """ + def search_sorted( + self, element: IntoExpr | np.ndarray[Any, Any], side: SearchSortedSide = ... + ) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> s = pl.Series("set", [1, 2, 3, 4, 4, 5, 6, 7]) + >>> s.search_sorted(4) + 4 + >>> s.search_sorted(4, "left") + 3 + >>> s.search_sorted(4, "right") + 5 + >>> s.search_sorted([1, 4, 5]) + shape: (3,) + Series: \'set\' [u32] + [ + 0 + 4 + 5 + ] + >>> s.search_sorted([1, 4, 5], "left") + shape: (3,) + Series: \'set\' [u32] + [ + 0 + 3 + 5 + ] + >>> s.search_sorted([1, 4, 5], "right") + shape: (3,) + Series: \'set\' [u32] + [ + 1 + 5 + 6 + ] + """ + def unique(self) -> Series: + """ + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 2, 3])
+ >>> s.unique().sort()
+ shape: (3,)
+ Series: \'a\' [i64]
+ [
+ 1
+ 2
+ 3
+ ]
+ """
+ def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series:
+ """
+ Take values by index.
+ 
+ Parameters
+ ----------
+ indices
+ Index location used for selection.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3, 4])
+ >>> s.gather([1, 3])
+ shape: (2,)
+ Series: \'a\' [i64]
+ [
+ 2
+ 4
+ ]
+ """
+ def null_count(self) -> int:
+ """
+ Count the null values in this Series.
+ 
+ Examples
+ --------
+ >>> s = pl.Series([1, None, None])
+ >>> s.null_count()
+ 2
+ """
+ def has_validity(self) -> bool:
+ """
+ Return True if the Series has a validity bitmask.
+ 
+ If there is no mask, it means that there are no `null` values.
+ 
+ Notes
+ -----
+ While the *absence* of a validity bitmask guarantees that a Series does not
+ have `null` values, the converse is not true, e.g. the *presence* of a
+ bitmask does not mean that there are null values, as every value of the
+ bitmask could be `false`.
+ 
+ To confirm that a column has `null` values use :func:`null_count`.
+ """
+ def is_empty(self) -> bool:
+ """
+ Check if the Series is empty.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [], dtype=pl.Float32)
+ >>> s.is_empty()
+ True
+ """
+ def is_sorted(self) -> bool:
+ """
+ Check if the Series is sorted.
+ 
+ Parameters
+ ----------
+ descending
+ Check if the Series is sorted in descending order.
+ 
+ Examples
+ --------
+ >>> s = pl.Series([1, 3, 2])
+ >>> s.is_sorted()
+ False
+ 
+ >>> s = pl.Series([3, 2, 1])
+ >>> s.is_sorted(descending=True)
+ True
+ """
+ def not_(self) -> Series:
+ """
+ Negate a boolean Series.
+ 
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [True, False, False])
+ >>> s.not_()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ false
+ true
+ true
+ ]
+ """
+ def is_null(self) -> Series:
+ """
+ Returns a boolean Series indicating which values are null.
+ 
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+ >>> s.is_null()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ false
+ true
+ ]
+ """
+ def is_not_null(self) -> Series:
+ """
+ Returns a boolean Series indicating which values are not null.
+ 
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+ >>> s.is_not_null()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ true
+ false
+ ]
+ """
+ def is_finite(self) -> Series:
+ """
+ Returns a boolean Series indicating which values are finite.
+ 
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+ 
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+ >>> s.is_finite()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ false
+ ]
+ """
+ def is_infinite(self) -> Series:
+ """
+ Returns a boolean Series indicating which values are infinite.
+ 
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+ 
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+ >>> s.is_infinite()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ true
+ ]
+ """
+ def is_nan(self) -> Series:
+ """
+ Returns a boolean Series indicating which values are NaN.
+ 
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + """ + def is_not_nan(self) -> Series: + """ + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + """ + def is_in(self, other: Series | Collection[Any]) -> Series: + """ + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + """ + def arg_true(self) -> Series: + """ + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + """ + def is_unique(self) -> Series: + """ + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + """ + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + """ + def is_duplicated(self) -> Series: + """ + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + """ + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + """ + def equals(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. 
+ 
+ Parameters
+ ----------
+ other
+ Series to compare with.
+ null_equal
+ Consider null values as equal.
+ strict
+ Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a
+ `pl.Int64` will return `False`.
+ 
+ See Also
+ --------
+ assert_series_equal
+ 
+ Examples
+ --------
+ >>> s1 = pl.Series("a", [1, 2, 3])
+ >>> s2 = pl.Series("b", [4, 5, 6])
+ >>> s1.equals(s1)
+ True
+ >>> s1.equals(s2)
+ False
+ """
+ def cast(
+ self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]
+ ) -> Self:
+ """
+ Cast between data types.
+ 
+ Parameters
+ ----------
+ dtype
+ DataType to cast to.
+ strict
+ Throw an error if a cast could not be done (for instance, due to an
+ overflow).
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [True, False, True])
+ >>> s
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ true
+ false
+ true
+ ]
+ 
+ >>> s.cast(pl.UInt32)
+ shape: (3,)
+ Series: \'a\' [u32]
+ [
+ 1
+ 0
+ 1
+ ]
+ """
+ def to_physical(self) -> Series:
+ """
+ Cast to physical representation of the logical dtype.
+ 
+ - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32`
+ - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64`
+ - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64`
+ - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64`
+ - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32`
+ - `List(inner)` -> `List(physical of inner)`
+ - Other data types will be left unchanged.
+ 
+ Examples
+ --------
+ Replicating the pandas
+ `pd.Series.factorize
+ <https://pandas.pydata.org/docs/reference/api/pandas.Series.factorize.html>`_
+ method.
+ 
+ >>> s = pl.Series("values", ["a", None, "x", "a"])
+ >>> s.cast(pl.Categorical).to_physical()
+ shape: (4,)
+ Series: \'values\' [u32]
+ [
+ 0
+ null
+ 1
+ 0
+ ]
+ """
+ def to_list(self) -> list[Any]:
+ """
+ Convert this Series to a Python list.
+ 
+ This operation copies data.
+ 
+ Parameters
+ ----------
+ use_pyarrow
+ Use PyArrow to perform the conversion.
+ 
+ .. deprecated:: 0.19.9
+ This parameter will be removed. The function can safely be called
+ without the parameter - it should give the exact same result.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.to_list()
+ [1, 2, 3]
+ >>> type(s.to_list())
+ <class 'list'>
+ """
+ def rechunk(self) -> Self:
+ """
+ Create a single chunk of memory for this Series.
+ 
+ Parameters
+ ----------
+ in_place
+ In place or not.
+ """
+ def reverse(self) -> Series:
+ """
+ Return Series in reverse order.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8)
+ >>> s.reverse()
+ shape: (3,)
+ Series: \'a\' [i8]
+ [
+ 3
+ 2
+ 1
+ ]
+ """
+ def is_between(
+ self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...
+ ) -> Series:
+ """
+ Get a boolean mask of the values that are between the given lower/upper bounds.
+ 
+ Parameters
+ ----------
+ lower_bound
+ Lower bound value. Accepts expression input. Non-expression inputs
+ (including strings) are parsed as literals.
+ upper_bound
+ Upper bound value. Accepts expression input. Non-expression inputs
+ (including strings) are parsed as literals.
+ closed : {\'both\', \'left\', \'right\', \'none\'}
+ Define which sides of the interval are closed (inclusive).
+ 
+ Notes
+ -----
+ If the value of the `lower_bound` is greater than that of the `upper_bound`
+ then the result will be False, as no value can satisfy the condition.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("num", [1, 2, 3, 4, 5])
+ >>> s.is_between(2, 4)
+ shape: (5,)
+ Series: \'num\' [bool]
+ [
+ false
+ true
+ true
+ true
+ false
+ ]
+ 
+ Use the `closed` argument to include or exclude the values at the bounds:
+ 
+ >>> s.is_between(2, 4, closed="left")
+ shape: (5,)
+ Series: \'num\' [bool]
+ [
+ false
+ true
+ true
+ false
+ false
+ ]
+ 
+ You can also use strings as well as numeric/temporal values:
+ 
+ >>> s = pl.Series("s", ["a", "b", "c", "d", "e"])
+ >>> s.is_between("b", "d", closed="both")
+ shape: (5,)
+ Series: \'s\' [bool]
+ [
+ false
+ true
+ true
+ true
+ false
+ ]
+ """
+ def to_numpy(self) -> np.ndarray[Any, Any]:
+ """
+ Convert this Series to a NumPy ndarray.
+ 
+ This operation may copy data, but is completely safe. Note that:
+ 
+ - Data which is purely numeric AND without null values is not cloned
+ - Floating point `nan` values can be zero-copied
+ - Booleans cannot be zero-copied
+ 
+ To ensure that no data is copied, set `allow_copy=False`.
+ 
+ Parameters
+ ----------
+ allow_copy
+ Allow memory to be copied to perform the conversion. If set to `False`,
+ causes conversions that are not zero-copy to fail.
+ writable
+ Ensure the resulting array is writable. This will force a copy of the data
+ if the array was created without copy, as the underlying Arrow data is
+ immutable.
+ use_pyarrow
+ Use `pyarrow.Array.to_numpy
+ <https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy>`_
+ for the conversion to NumPy.
+ zero_copy_only
+ Raise an exception if the conversion to a NumPy array would require copying
+ the underlying data. Data copy occurs, for example, when the Series contains
+ nulls or non-numeric types.
+ 
+ .. deprecated:: 0.20.10
+ Use the `allow_copy` parameter instead, which is the inverse of this
+ one.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> arr = s.to_numpy()
+ >>> arr # doctest: +IGNORE_RESULT
+ array([1, 2, 3], dtype=int64)
+ >>> type(arr)
+ <class 'numpy.ndarray'>
+ """
+ def to_arrow(self) -> pa.Array:
+ """
+ Return the underlying Arrow array.
+ 
+ If the Series contains only a single chunk this operation is zero copy.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s = s.to_arrow()
+ >>> s # doctest: +ELLIPSIS
+ <pyarrow.lib.Int64Array object at ...>
+ [
+ 1,
+ 2,
+ 3
+ ]
+ """
+ def to_pandas(self, **kwargs: Any) -> pd.Series[Any]:
+ """
+ Convert this Series to a pandas Series.
+ 
+ This operation copies data if `use_pyarrow_extension_array` is not enabled.
+ 
+ Parameters
+ ----------
+ use_pyarrow_extension_array
+ Use a PyArrow-backed extension array instead of a NumPy array for the pandas
+ Series. This allows zero copy operations and preservation of null values.
+ Subsequent operations on the resulting pandas Series may trigger conversion
+ to NumPy if those operations are not supported by PyArrow compute functions.
+ **kwargs
+ Additional keyword arguments to be passed to
+ :meth:`pyarrow.Array.to_pandas`.
+ 
+ Returns
+ -------
+ :class:`pandas.Series`
+ 
+ Notes
+ -----
+ This operation requires that both :mod:`pandas` and :mod:`pyarrow` are
+ installed.
+ 
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.to_pandas()
+ 0 1
+ 1 2
+ 2 3
+ Name: a, dtype: int64
+ 
+ Null values are converted to `NaN`.
+ 
+ >>> s = pl.Series("b", [1, 2, None])
+ >>> s.to_pandas()
+ 0 1.0
+ 1 2.0
+ 2 NaN
+ Name: b, dtype: float64
+ 
+ Pass `use_pyarrow_extension_array=True` to get a pandas Series backed by a
+ PyArrow extension array. This will preserve null values.
+ + >>> s.to_pandas(use_pyarrow_extension_array=True) + 0 1 + 1 2 + 2 + Name: b, dtype: int64[pyarrow] + """ + def to_init_repr(self, n: int = ...) -> str: + """ + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + """ + def count(self) -> int: + """ + Return the number of non-null elements in the column. + + See Also + -------- + len + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.count() + 2 + """ + def len(self) -> int: + """ + Return the number of elements in the Series. + + Null values count towards the total. + + See Also + -------- + count + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + """ + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + """ + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + """ + def scatter( + self, + indices: Series | Iterable[int] | int | np.ndarray[Any, Any], + values: Series | Iterable[PythonLiteral] | PythonLiteral | None, + ) -> Series: + """ + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_index().select( + ... pl.when(pl.col("index") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + """ + def clear(self, n: int = ...) -> Series: + """ + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + """ + def clone(self) -> Self: + """ + Create a copy of this Series. 
+ + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + """ + def fill_nan(self, value: int | float | Expr | None) -> Series: + """ + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + """ + def fill_null( + self, + value: Any | None = ..., + strategy: FillNullStrategy | None = ..., + limit: int | None = ..., + ) -> Series: + """ + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + """ + def floor(self) -> Series: + """ + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + """ + def ceil(self) -> Series: + """ + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + """ + def round(self, decimals: int = ...) -> Series: + """ + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + """ + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + """ + def dot(self, other: Series | ArrayLike) -> int | float | None: + """ + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + """ + def mode(self) -> Series: + """ + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + """ + def sign(self) -> Series: + """ + Compute the element-wise indication of the sign. 
+ + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + """ + def sin(self) -> Series: + """ + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + """ + def cos(self) -> Series: + """ + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + """ + def tan(self) -> Series: + """ + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + """ + def cot(self) -> Series: + """ + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + """ + def arcsin(self) -> Series: + """ + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + """ + def arccos(self) -> Series: + """ + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + """ + def arctan(self) -> Series: + """ + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + """ + def arcsinh(self) -> Series: + """ + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + """ + def arccosh(self) -> Series: + """ + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + """ + def arctanh(self) -> Series: + """ + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + """ + def sinh(self) -> Series: + """ + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + """ + def cosh(self) -> Series: + """ + Compute the element-wise value for the hyperbolic cosine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + """ + def tanh(self) -> Series: + """ + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + """ + def map_elements( + self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ... + ) -> Self: + """ + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. + If not set, the dtype will be inferred based on the first non-null value + that is returned by the function. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10, return_dtype=pl.Int64) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + """ + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. + + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. 
+ + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + """ + def rolling_min( + self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ... + ) -> Series: + """ + Apply a rolling min (moving min) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + """ + def rolling_max( + self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ... + ) -> Series: + """ + Apply a rolling max (moving max) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. 
If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + """ + def rolling_mean( + self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ... + ) -> Series: + """ + Apply a rolling mean (moving mean) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + """ + def rolling_sum( + self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ... + ) -> Series: + """ + Apply a rolling sum (moving sum) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + """ + def rolling_std( + self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ... + ) -> Series: + """ + Compute a rolling std dev. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + """ + def rolling_var( + self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ... + ) -> Series: + """ + Compute a rolling variance. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + """ + def rolling_map( + self, + function: Callable[[Series], Any], + window_size: int, + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Series: + """ + Compute a custom rolling window function. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window. 
+ + Warnings + -------- + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + """ + def rolling_median( + self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ... + ) -> Series: + """ + Compute a rolling median. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + """ + def rolling_quantile( + self, + quantile: float, + interpolation: RollingInterpolationMethod = ..., + window_size: int = ..., + weights: list[float] | None = ..., + min_periods: int | None = ..., + ) -> Series: + """ + Compute a rolling quantile. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + """ + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. 
+ bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + """ + def sample(self, n: int | None = ...) -> Series: + """ + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + """ + def peak_max(self) -> Self: + """ + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + """ + def peak_min(self) -> Self: + """ + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + """ + def n_unique(self) -> int: + """ + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + """ + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + """ + def hash( + self, + seed: int = ..., + seed_1: int | None = ..., + seed_2: int | None = ..., + seed_3: int | None = ..., + ) -> Series: + """ + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + """ + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + """ + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + """ + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + + Examples + -------- + >>> s = pl.Series([1, -2, -3]) + >>> s.abs() + shape: (3,) + Series: '' [i64] + [ + 1 + 2 + 3 + ] + """ + def rank(self, method: RankMethod = ...) -> Series: + """ + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + """ + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + """ + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + """ + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. 
+ + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> s = pl.Series([1, 2, 2, 4, 5]) + >>> s.skew() + 0.34776706224699483 + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("grades", [66, 79, 54, 97, 96, 70, 69, 85, 93, 75]) + >>> s.kurtosis() + -1.0522623626787952 + >>> s.kurtosis(fisher=False) + 1.9477376373212048 + >>> s.kurtosis(fisher=False, bias=False) + 2.1040361802642726 + """ + def clip( + self, + lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., + upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., + ) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + """ + def lower_bound(self) -> Self: + """ + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. 
+ + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + """ + def upper_bound(self) -> Self: + """ + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + """ + def replace( + self, + old: IntoExpr | Sequence[Any] | Mapping[Any, Any], + new: IntoExpr | Sequence[Any] | NoDefault = ..., + ) -> Self: + """ + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(old=Series(mapping.keys()), new=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting Series. If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> s = pl.Series([1, 2, 2, 3]) + >>> s.replace(2, 100) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> s.replace([2, 3], [100, 200]) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 200 + ] + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> s.replace(mapping, default=-1) + shape: (4,) + Series: \'\' [i64] + [ + -1 + 100 + 100 + 200 + ] + + + The default can be another Series. + + >>> default = pl.Series([2.5, 5.0, 7.5, 10.0]) + >>> s.replace(2, 100, default=default) + shape: (4,) + Series: \'\' [f64] + [ + 2.5 + 100.0 + 100.0 + 10.0 + ] + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> s = pl.Series(["x", "y", "z"]) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> s.replace(mapping) + shape: (3,) + Series: \'\' [str] + [ + "1" + "2" + "3" + ] + >>> s.replace(mapping, default=None) + shape: (3,) + Series: \'\' [i64] + [ + 1 + 2 + 3 + ] + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> s.replace(mapping, return_dtype=pl.UInt8) + shape: (3,) + Series: \'\' [u8] + [ + 1 + 2 + 3 + ] + """ + def reshape(self, dimensions: tuple[int, ...]) -> Series: + """ + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. 
If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + """ + def shuffle(self, seed: int | None = ...) -> Series: + """ + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + """ + def ewm_mean( + self, + com: float | None = ..., + span: float | None = ..., + half_life: float | None = ..., + alpha: float | None = ..., + ) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` (the default) the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False`, weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True` (current default), weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1, ignore_nulls=False) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + """ + def ewm_mean_by(self, by: str | IntoExpr) -> Series: + """ + Calculate time-based exponentially weighted moving average. + + Given observations :math:`x_1, x_2, \\ldots, x_n` at times + :math:`t_1, t_2, \\ldots, t_n`, the EWMA is calculated as + + .. 
+
+            y_0 &= x_0
+
+            \\alpha_i &= \\exp(-\\lambda(t_i - t_{i-1}))
+
+            y_i &= \\alpha_i x_i + (1 - \\alpha_i) y_{i-1}; \\quad i > 0
+
+        where :math:`\\lambda` equals :math:`\\ln(2) / \\text{half_life}`.
+
+        Parameters
+        ----------
+        by
+            Times to calculate average by. Should be ``DateTime``, ``Date``, ``UInt64``,
+            ``UInt32``, ``Int64``, or ``Int32`` data type.
+        half_life
+            Unit over which observation decays to half its value.
+
+            Can be created either from a timedelta, or
+            by using the following string language:
+
+            - 1ns (1 nanosecond)
+            - 1us (1 microsecond)
+            - 1ms (1 millisecond)
+            - 1s (1 second)
+            - 1m (1 minute)
+            - 1h (1 hour)
+            - 1d (1 day)
+            - 1w (1 week)
+            - 1i (1 index count)
+
+            Or combine them:
+            "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+
+            Note that `half_life` is treated as a constant duration - calendar
+            durations such as months (or even days in the time-zone-aware case)
+            are not supported, please express your duration in an approximately
+            equivalent number of hours (e.g. \'370h\' instead of \'1mo\').
+        check_sorted
+            Check whether `by` column is sorted.
+            Incorrectly setting this to `False` will lead to incorrect output.
+
+        Returns
+        -------
+        Series
+            Float32 if input is Float32, otherwise Float64.
+
+        Examples
+        --------
+        >>> from datetime import date, timedelta
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "values": [0, 1, 2, None, 4],
+        ...         "times": [
+        ...             date(2020, 1, 1),
+        ...             date(2020, 1, 3),
+        ...             date(2020, 1, 10),
+        ...             date(2020, 1, 15),
+        ...             date(2020, 1, 17),
+        ...         ],
+        ...     }
+        ... ).sort("times")
+        >>> df["values"].ewm_mean_by(df["times"], half_life="4d")
+        shape: (5,)
+        Series: \'values\' [f64]
+        [
+            0.0
+            0.292893
+            1.492474
+            null
+            3.254508
+        ]
+        """
+    def ewm_std(
+        self,
+        com: float | None = ...,
+        span: float | None = ...,
+        half_life: float | None = ...,
+        alpha: float | None = ...,
+    ) -> Series:
+        """
+        Exponentially-weighted moving standard deviation.
+
+        Parameters
+        ----------
+        com
+            Specify decay in terms of center of mass, :math:`\\gamma`, with
+
+            .. math::
+                \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0
+        span
+            Specify decay in terms of span, :math:`\\theta`, with
+
+            .. math::
+                \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1
+        half_life
+            Specify decay in terms of half-life, :math:`\\lambda`, with
+
+            .. math::
+                \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\;
+                \\forall \\; \\lambda > 0
+        alpha
+            Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`.
+        adjust
+            Divide by decaying adjustment factor in beginning periods to account for
+            imbalance in relative weightings
+
+            - When `adjust=True` (the default) the EW function is calculated
+              using weights :math:`w_i = (1 - \\alpha)^i`
+            - When `adjust=False` the EW function is calculated
+              recursively by
+
+              .. math::
+                  y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t
+        bias
+            When `bias=False`, apply a correction to make the estimate statistically
+            unbiased.
+        min_periods
+            Minimum number of observations in window required to have a value
+            (otherwise result is null).
+        ignore_nulls
+            Ignore missing values when calculating weights.
+
+            - When `ignore_nulls=False`, weights are based on absolute
+              positions.
+              For example, the weights of :math:`x_0` and :math:`x_2` used in
+              calculating the final weighted average of
+              [:math:`x_0`, None, :math:`x_2`] are
+              :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and
+              :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`.
+
+            - When `ignore_nulls=True` (current default), weights are based
+              on relative positions. For example, the weights of
+              :math:`x_0` and :math:`x_2` used in calculating the final weighted
+              average of [:math:`x_0`, None, :math:`x_2`] are
+              :math:`1-\\alpha` and :math:`1` if `adjust=True`,
+              and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s.ewm_std(com=1, ignore_nulls=False)
+        shape: (3,)
+        Series: \'a\' [f64]
+        [
+            0.0
+            0.707107
+            0.963624
+        ]
+        """
+    def ewm_var(
+        self,
+        com: float | None = ...,
+        span: float | None = ...,
+        half_life: float | None = ...,
+        alpha: float | None = ...,
+    ) -> Series:
+        """
+        Exponentially-weighted moving variance.
+
+        Parameters
+        ----------
+        com
+            Specify decay in terms of center of mass, :math:`\\gamma`, with
+
+            .. math::
+                \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0
+        span
+            Specify decay in terms of span, :math:`\\theta`, with
+
+            .. math::
+                \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1
+        half_life
+            Specify decay in terms of half-life, :math:`\\lambda`, with
+
+            .. math::
+                \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\;
+                \\forall \\; \\lambda > 0
+        alpha
+            Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`.
+        adjust
+            Divide by decaying adjustment factor in beginning periods to account for
+            imbalance in relative weightings
+
+            - When `adjust=True` (the default) the EW function is calculated
+              using weights :math:`w_i = (1 - \\alpha)^i`
+            - When `adjust=False` the EW function is calculated
+              recursively by
+
+              .. math::
+                  y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t
+        bias
+            When `bias=False`, apply a correction to make the estimate statistically
+            unbiased.
+        min_periods
+            Minimum number of observations in window required to have a value
+            (otherwise result is null).
+        ignore_nulls
+            Ignore missing values when calculating weights.
+
+            - When `ignore_nulls=False`, weights are based on absolute
+              positions.
+              For example, the weights of :math:`x_0` and :math:`x_2` used in
+              calculating the final weighted average of
+              [:math:`x_0`, None, :math:`x_2`] are
+              :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and
+              :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`.
+
+            - When `ignore_nulls=True` (current default), weights are based
+              on relative positions. For example, the weights of
+              :math:`x_0` and :math:`x_2` used in calculating the final weighted
+              average of [:math:`x_0`, None, :math:`x_2`] are
+              :math:`1-\\alpha` and :math:`1` if `adjust=True`,
+              and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s.ewm_var(com=1, ignore_nulls=False)
+        shape: (3,)
+        Series: \'a\' [f64]
+        [
+            0.0
+            0.5
+            0.928571
+        ]
+        """
+    def extend_constant(self, value: IntoExpr, n: int | IntoExprColumn) -> Series:
+        """
+        Extremely fast method for extending the Series with 'n' copies of a value.
+
+        Parameters
+        ----------
+        value
+            A constant literal value or a unit expression with which to extend the
+            expression result Series; can pass None to extend with nulls.
+        n
+            The number of additional values that will be added.
+
+        Examples
+        --------
+        >>> s = pl.Series([1, 2, 3])
+        >>> s.extend_constant(99, n=2)
+        shape: (5,)
+        Series: '' [i64]
+        [
+            1
+            2
+            3
+            99
+            99
+        ]
+        """
+    def set_sorted(self) -> Self:
+        """
+        Flags the Series as \'sorted\'.
+
+        Enables downstream code to use fast paths for sorted arrays.
+
+        Parameters
+        ----------
+        descending
+            If the `Series` order is descending.
+
+        Warnings
+        --------
+        This can lead to incorrect results if this `Series` is not sorted!!
+        Use with care!
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s.set_sorted().max()
+        3
+        """
+    def new_from_index(self, index: int, length: int) -> Self:
+        """Create a new Series filled with values from the given index."""
+    def shrink_dtype(self) -> Series:
+        """
+        Shrink numeric columns to the minimal required datatype.
+
+        Shrink to the dtype needed to fit the extrema of this [`Series`].
+        This can be used to reduce memory pressure.
+        """
+    def get_chunks(self) -> list[Series]:
+        """Get the chunks of this Series as a list of Series."""
+    def implode(self) -> Self:
+        """
+        Aggregate values into a list.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s.implode()
+        shape: (1,)
+        Series: \'a\' [list[i64]]
+        [
+            [1, 2, 3]
+        ]
+        """
+    def apply(
+        self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...
+    ) -> Self:
+        """
+        Apply a custom/user-defined function (UDF) over elements in this Series.
+
+        .. deprecated:: 0.19.0
+            This method has been renamed to :func:`Series.map_elements`.
+
+        Parameters
+        ----------
+        function
+            Custom function or lambda.
+        return_dtype
+            Output datatype. If none is given, the same datatype as this Series will be
+            used.
+        skip_nulls
+            Nulls will be skipped and not passed to the python function.
+            This is faster because python can be skipped and because we call
+            more specialized functions.
+        """
+    def rolling_apply(
+        self,
+        function: Callable[[Series], Any],
+        window_size: int,
+        weights: list[float] | None = ...,
+        min_periods: int | None = ...,
+    ) -> Series:
+        """
+        Apply a custom rolling window function.
+
+        .. deprecated:: 0.19.0
+            This method has been renamed to :func:`Series.rolling_map`.
+
+        Parameters
+        ----------
+        function
+            Aggregation function
+        window_size
+            The length of the window.
+        weights
+            An optional slice with the same length as the window that will be multiplied
+            elementwise with the values in the window.
+        min_periods
+            The number of values in the window that should be non-null before computing
+            a result. If None, it will be set equal to:
+
+            - the window size, if `window_size` is a fixed integer
+            - 1, if `window_size` is a dynamic temporal size
+        center
+            Set the labels at the center of the window
+        """
+    def is_first(self) -> Series:
+        """
+        Return a boolean mask indicating the first occurrence of each distinct value.
+
+        .. deprecated:: 0.19.3
+            This method has been renamed to :func:`Series.is_first_distinct`.
+
+        Returns
+        -------
+        Series
+            Series of data type :class:`Boolean`.
+        """
+    def is_last(self) -> Series:
+        """
+        Return a boolean mask indicating the last occurrence of each distinct value.
+
+        .. deprecated:: 0.19.3
+            This method has been renamed to :func:`Series.is_last_distinct`.
+
+        Returns
+        -------
+        Series
+            Series of data type :class:`Boolean`.
+        """
+    def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series:
+        """
+        Clip (limit) the values in an array to a `min` boundary.
+
+        .. deprecated:: 0.19.12
+            Use :func:`clip` instead.
+
+        Parameters
+        ----------
+        lower_bound
+            Lower bound.
+        """
+    def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series:
+        """
+        Clip (limit) the values in an array to a `max` boundary.
+
+        .. deprecated:: 0.19.12
+            Use :func:`clip` instead.
+
+        Parameters
+        ----------
+        upper_bound
+            Upper bound.
+        """
+    def shift_and_fill(self, fill_value: int | Expr) -> Series:
+        """
+        Shift values by the given number of places and fill the resulting null values.
+
+        .. deprecated:: 0.19.12
+            Use :func:`shift` instead.
+
+        Parameters
+        ----------
+        fill_value
+            Fill None values with the result of this expression.
+        n
+            Number of places to shift (may be negative).
+        """
+    def is_float(self) -> bool:
+        """
+        Check if this Series has floating point numbers.
+
+        .. deprecated:: 0.19.13
+            Use `Series.dtype.is_float()` instead.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1.0, 2.0, 3.0])
+        >>> s.is_float()  # doctest: +SKIP
+        True
+        """
+    def is_integer(self, signed: bool | None = ...) -> bool:
+        """
+        Check if this Series datatype is an integer (signed or unsigned).
+
+        .. deprecated:: 0.19.13
+            Use `Series.dtype.is_integer()` instead.
+            For signed/unsigned variants, use `Series.dtype.is_signed_integer()`
+            or `Series.dtype.is_unsigned_integer()`.
+
+        Parameters
+        ----------
+        signed
+            * if `None`, both signed and unsigned integer dtypes will match.
+            * if `True`, only signed integer dtypes will be considered a match.
+            * if `False`, only unsigned integer dtypes will be considered a match.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32)
+        >>> s.is_integer()  # doctest: +SKIP
+        True
+        >>> s.is_integer(signed=False)  # doctest: +SKIP
+        True
+        >>> s.is_integer(signed=True)  # doctest: +SKIP
+        False
+        """
+    def is_numeric(self) -> bool:
+        """
+        Check if this Series datatype is numeric.
+
+        .. deprecated:: 0.19.13
+            Use `Series.dtype.is_numeric()` instead.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s.is_numeric()  # doctest: +SKIP
+        True
+        """
+    def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool:
+        """
+        Check if this Series datatype is temporal.
+
+        .. deprecated:: 0.19.13
+            Use `Series.dtype.is_temporal()` instead.
+
+        Parameters
+        ----------
+        excluding
+            Optionally exclude one or more temporal dtypes from matching.
+
+        Examples
+        --------
+        >>> from datetime import date
+        >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)])
+        >>> s.is_temporal()  # doctest: +SKIP
+        True
+        >>> s.is_temporal(excluding=[pl.Date])  # doctest: +SKIP
+        False
+        """
+    def is_boolean(self) -> bool:
+        """
+        Check if this Series is a Boolean.
+
+        .. deprecated:: 0.19.14
+            Use `Series.dtype == pl.Boolean` instead.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [True, False, True])
+        >>> s.is_boolean()  # doctest: +SKIP
+        True
+        """
+    def is_utf8(self) -> bool:
+        """
+        Check if this Series datatype is a String.
+
+        .. deprecated:: 0.19.14
+            Use `Series.dtype == pl.String` instead.
+
+        Examples
+        --------
+        >>> s = pl.Series("x", ["a", "b", "c"])
+        >>> s.is_utf8()  # doctest: +SKIP
+        True
+        """
+    def take_every(self, n: int, offset: int = ...) -> Series:
+        """
+        Take every nth value in the Series and return as new Series.
+
+        .. deprecated:: 0.19.14
+            This method has been renamed to :meth:`gather_every`.
+
+        Parameters
+        ----------
+        n
+            Gather every *n*-th row.
+        offset
+            Starting index.
+        """
+    def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series:
+        """
+        Take values by index.
+
+        .. deprecated:: 0.19.14
+            This method has been renamed to :meth:`gather`.
+
+        Parameters
+        ----------
+        indices
+            Index location used for selection.
+        """
+    def set_at_idx(
+        self,
+        indices: Series | np.ndarray[Any, Any] | Sequence[int] | int,
+        values: int
+        | float
+        | str
+        | bool
+        | date
+        | datetime
+        | Sequence[int]
+        | Sequence[float]
+        | Sequence[bool]
+        | Sequence[str]
+        | Sequence[date]
+        | Sequence[datetime]
+        | Series
+        | None,
+    ) -> Series:
+        """
+        Set values at the index locations.
+
+        .. deprecated:: 0.19.14
+            This method has been renamed to :meth:`scatter`.
+
+        Parameters
+        ----------
+        indices
+            Integers representing the index locations.
+        values
+            Replacement values.
+        """
+    def cumsum(self) -> Series:
+        """
+        Get an array with the cumulative sum computed at every element.
+
+        .. deprecated:: 0.19.14
+            This method has been renamed to :meth:`cum_sum`.
+
+        Parameters
+        ----------
+        reverse
+            reverse the operation.
+        """
+    def cummax(self) -> Series:
+        """
+        Get an array with the cumulative max computed at every element.
+
+        .. deprecated:: 0.19.14
+            This method has been renamed to :meth:`cum_max`.
+
+        Parameters
+        ----------
+        reverse
+            reverse the operation.
+        """
+    def cummin(self) -> Series:
+        """
+        Get an array with the cumulative min computed at every element.
+
+        .. deprecated:: 0.19.14
+            This method has been renamed to :meth:`cum_min`.
+
+        Parameters
+        ----------
+        reverse
+            reverse the operation.
+        """
+    def cumprod(self) -> Series:
+        """
+        Get an array with the cumulative product computed at every element.
+
+        .. deprecated:: 0.19.14
+            This method has been renamed to :meth:`cum_prod`.
+
+        Parameters
+        ----------
+        reverse
+            reverse the operation.
+        """
+    def view(self) -> SeriesView:
+        """
+        Get a view into this Series data with a numpy array.
+
+        .. deprecated:: 0.19.14
+            This method will be removed in a future version.
+
+        This operation doesn't clone data, but does not include missing values.
+        Don't use this unless you know what you are doing.
+
+        Parameters
+        ----------
+        ignore_nulls
+            If True then nulls are converted to 0.
+            If False then an Exception is raised if nulls are present.
+        """
+    def map_dict(self, mapping: dict[Any, Any]) -> Self:
+        """
+        Replace values in the Series using a remapping dictionary.
+
+        .. deprecated:: 0.19.16
+            This method has been renamed to :meth:`replace`. The default behavior
+            has changed to keep any values not present in the mapping unchanged.
+            Pass `default=None` to keep existing behavior.
+
+        Parameters
+        ----------
+        mapping
+            Dictionary containing the before/after values to map.
+        default
+            Value to use when the remapping dict does not contain the lookup value.
+            Use `pl.first()` to keep the original value.
+        return_dtype
+            Set return dtype to override automatic return dtype determination.
+        """
+    def series_equal(self, other: Series) -> bool:
+        """
+        Check whether the Series is equal to another Series.
+
+        .. deprecated:: 0.19.16
+            This method has been renamed to :meth:`equals`.
+
+        Parameters
+        ----------
+        other
+            Series to compare with.
+        null_equal
+            Consider null values as equal.
+        strict
+            Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a
+            `pl.Int64` will return `False`.
+        """
+    @property
+    def dtype(self): ...
+    @property
+    def flags(self): ...
+    @property
+    def inner_dtype(self): ...
+    @property
+    def name(self): ...
+    @property
+    def shape(self): ...
+    @property
+    def bin(self): ...
+    @property
+    def cat(self): ...
+    @property
+    def dt(self): ...
+    @property
+    def list(self): ...
+    @property
+    def arr(self): ...
+    @property
+    def str(self): ...
+    @property
+    def struct(self): ...
+    @property
+    def plot(self): ...
+
+def _resolve_temporal_dtype(
+    dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]
+) -> PolarsDataType | None:
+    """Given polars/numpy temporal dtypes, resolve to an explicit unit."""
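+
+# NOTE: a minimal, hypothetical usage sketch of `_resolve_temporal_dtype`. The results
+# shown are assumptions about how numpy temporal units are typically mapped to polars
+# dtypes and may differ from the actual implementation; they are not verified output.
+# >>> import numpy as np
+# >>> _resolve_temporal_dtype(None, np.dtype("datetime64[ms]"))  # doctest: +SKIP
+# Datetime(time_unit='ms', time_zone=None)  # assumed result
+# >>> _resolve_temporal_dtype(None, np.dtype("timedelta64[us]"))  # doctest: +SKIP
+# Duration(time_unit='us')  # assumed result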